gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
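/* A hedged illustration (the array mode name is a target assumption): for
   VECTYPE of mode V4SI and COUNT == 2 the target may provide a 32-byte
   array mode; if the optab handler for that (array mode, V4SI) pair is
   defined the dump reports "can use <NAME><array mode><V4SI>", otherwise
   "cannot use ...".  */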
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
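/* A worked example, for illustration only: for the cast
   'int_x = (int) short_y' the lhs size is 4 and the rhs size is 2, so the
   function returns the short type; a loop containing only such promotions
   is then vectorized with the VF implied by the 2-byte element.  */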
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn))
155 /* For loads the LHS type does the trick. */
156 i = ~0U;
157 else if (internal_store_fn_p (ifn))
 159 /* For stores use the type of the stored value. */
160 i = internal_fn_stored_value_index (ifn);
161 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
162 i = ~0U;
164 else if (internal_fn_mask_index (ifn) == 0)
165 i = 1;
167 if (i < gimple_call_num_args (call))
169 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
170 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
172 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
173 if (rhs < lhs)
174 scalar_type = rhs_type;
179 return scalar_type;
183 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
184 tested at run-time. Return TRUE if DDR was successfully inserted.
185 Return false if versioning is not supported. */
187 static opt_result
188 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
190 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
192 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
193 return opt_result::failure_at (vect_location,
194 "will not create alias checks, as"
195 " --param vect-max-version-for-alias-checks"
196 " == 0\n");
198 opt_result res
199 = runtime_alias_check_p (ddr, loop,
200 optimize_loop_nest_for_speed_p (loop));
201 if (!res)
202 return res;
204 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
205 return opt_result::success ();
208 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
210 static void
211 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
213 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 214 for (unsigned int i = 0; i < checks.length (); ++i)
215 if (checks[i] == value)
216 return;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location,
220 "need run-time check that %T is nonzero\n",
221 value);
222 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 /* Return true if we know that the order of vectorized DR_INFO_A and
226 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
227 DR_INFO_B. At least one of the accesses is a write. */
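/* For illustration (an assumed scenario): given
   '.. = a[i]; *p = x; .. = a[i+1];' the two loads form a group that is
   emitted at the position of the first scalar load, so the vectorized
   load of a[i+1] moves before the store to *p; for that (load, store)
   pair the function therefore returns false.  */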
229 static bool
230 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
232 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
233 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
235 /* Single statements are always kept in their original order. */
236 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
237 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
238 return true;
240 /* STMT_A and STMT_B belong to overlapping groups. All loads are
241 emitted at the position of the first scalar load.
242 Stores in a group are emitted at the position of the last scalar store.
243 Compute that position and check whether the resulting order matches
244 the current one. */
245 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (il_a)
248 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 il_a = get_later_stmt (il_a, s);
252 else /* DR_IS_READ */
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
258 else
259 il_a = stmtinfo_a;
260 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
261 if (il_b)
263 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
264 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
265 s = DR_GROUP_NEXT_ELEMENT (s))
266 il_b = get_later_stmt (il_b, s);
267 else /* DR_IS_READ */
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 if (get_later_stmt (il_b, s) == il_b)
271 il_b = s;
273 else
274 il_b = stmtinfo_b;
275 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
276 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 /* A subroutine of vect_analyze_data_ref_dependence. Handle
280 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
281 distances. These distances are conservatively correct but they don't
282 reflect a guaranteed dependence.
284 Return true if this function does all the work necessary to avoid
285 an alias or false if the caller should use the dependence distances
286 to limit the vectorization factor in the usual way. LOOP_DEPTH is
287 the depth of the loop described by LOOP_VINFO and the other arguments
288 are as for vect_analyze_data_ref_dependence. */
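/* A hedged example of the situation handled here: a write to p[i] and a
   read from q[i + 2] where p and q might refer to the same array.  The
   distance vector {2} is conservatively correct but only binding if the
   bases coincide, so instead of limiting the VF to 2 we prefer to emit a
   runtime alias check (unless safelen already covers the distance or a
   gather/scatter access is involved).  */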
290 static bool
291 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 int loop_depth, unsigned int *max_vf)
295 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
296 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
298 int dist = dist_v[loop_depth];
299 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
301 /* If the user asserted safelen >= DIST consecutive iterations
302 can be executed concurrently, assume independence.
304 ??? An alternative would be to add the alias check even
305 in this case, and vectorize the fallback loop with the
306 maximum VF set to safelen. However, if the user has
307 explicitly given a length, it's less likely that that
308 would be a win. */
309 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
311 if ((unsigned int) loop->safelen < *max_vf)
312 *max_vf = loop->safelen;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
314 continue;
317 /* For dependence distances of 2 or more, we have the option
318 of limiting VF or checking for an alias at runtime.
319 Prefer to check at runtime if we can, to avoid limiting
320 the VF unnecessarily when the bases are in fact independent.
322 Note that the alias checks will be removed if the VF ends up
323 being small enough. */
324 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
325 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
326 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
327 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
328 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 return true;
335 /* Function vect_analyze_data_ref_dependence.
337 FIXME: I needed to change the sense of the returned flag.
 339 Return failure if there (might) exist an unhandled dependence between
 340 memory-reference DRA and memory-reference DRB. When versioning for
 341 alias can check the dependence at run-time, record the check and return
 342 success. Adjust *MAX_VF according to the data dependence. */
344 static opt_result
345 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
346 loop_vec_info loop_vinfo,
347 unsigned int *max_vf)
349 unsigned int i;
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 struct data_reference *dra = DDR_A (ddr);
352 struct data_reference *drb = DDR_B (ddr);
353 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
354 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
355 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
356 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
357 lambda_vector dist_v;
358 unsigned int loop_depth;
360 /* If user asserted safelen consecutive iterations can be
361 executed concurrently, assume independence. */
362 auto apply_safelen = [&]()
364 if (loop->safelen >= 2)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 return true;
371 return false;
374 /* In loop analysis all data references should be vectorizable. */
375 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
376 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
377 gcc_unreachable ();
379 /* Independent data accesses. */
380 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
381 return opt_result::success ();
383 if (dra == drb
384 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
385 return opt_result::success ();
387 /* We do not have to consider dependences between accesses that belong
388 to the same group, unless the stride could be smaller than the
389 group size. */
390 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
391 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
393 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
394 return opt_result::success ();
396 /* Even if we have an anti-dependence then, as the vectorized loop covers at
397 least two scalar iterations, there is always also a true dependence.
398 As the vectorizer does not re-order loads and stores we can ignore
399 the anti-dependence if TBAA can disambiguate both DRs similar to the
400 case with known negative distance anti-dependences (positive
401 distance anti-dependences would violate TBAA constraints). */
402 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
403 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
404 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
405 get_alias_set (DR_REF (drb))))
406 return opt_result::success ();
408 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
409 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 if (apply_safelen ())
412 return opt_result::success ();
414 return opt_result::failure_at
415 (stmtinfo_a->stmt,
416 "possible alias involving gather/scatter between %T and %T\n",
417 DR_REF (dra), DR_REF (drb));
420 /* Unknown data dependence. */
421 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
423 if (apply_safelen ())
424 return opt_result::success ();
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
428 "versioning for alias required: "
429 "can't determine dependence between %T and %T\n",
430 DR_REF (dra), DR_REF (drb));
432 /* Add to list of ddrs that need to be tested at run-time. */
433 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 /* Known data dependence. */
437 if (DDR_NUM_DIST_VECTS (ddr) == 0)
439 if (apply_safelen ())
440 return opt_result::success ();
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
444 "versioning for alias required: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
447 /* Add to list of ddrs that need to be tested at run-time. */
448 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
453 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
454 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
455 loop_depth, max_vf))
456 return opt_result::success ();
458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
460 int dist = dist_v[loop_depth];
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "dependence distance = %d.\n", dist);
466 if (dist == 0)
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance == 0 between %T and %T\n",
471 DR_REF (dra), DR_REF (drb));
473 /* When we perform grouped accesses and perform implicit CSE
474 by detecting equal accesses and doing disambiguation with
475 runtime alias tests like for
476 .. = a[i];
477 .. = a[i+1];
478 a[i] = ..;
479 a[i+1] = ..;
480 *p = ..;
481 .. = a[i];
482 .. = a[i+1];
483 where we will end up loading { a[i], a[i+1] } once, make
484 sure that inserting group loads before the first load and
485 stores after the last store will do the right thing.
486 Similar for groups like
487 a[i] = ...;
488 ... = a[i];
489 a[i+1] = ...;
490 where loads from the group interleave with the store. */
491 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
492 return opt_result::failure_at (stmtinfo_a->stmt,
493 "READ_WRITE dependence"
494 " in interleaving.\n");
496 if (loop->safelen < 2)
498 tree indicator = dr_zero_step_indicator (dra);
499 if (!indicator || integer_zerop (indicator))
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "access also has a zero step\n");
502 else if (TREE_CODE (indicator) != INTEGER_CST)
503 vect_check_nonzero_value (loop_vinfo, indicator);
505 continue;
508 if (dist > 0 && DDR_REVERSED_P (ddr))
510 /* If DDR_REVERSED_P the order of the data-refs in DDR was
511 reversed (to make distance vector positive), and the actual
512 distance is negative. */
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance negative.\n");
516 /* When doing outer loop vectorization, we need to check if there is
517 a backward dependence at the inner loop level if the dependence
518 at the outer loop is reversed. See PR81740. */
519 if (nested_in_vect_loop_p (loop, stmtinfo_a)
520 || nested_in_vect_loop_p (loop, stmtinfo_b))
522 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
523 DDR_LOOP_NEST (ddr));
524 if (dist_v[inner_depth] < 0)
525 return opt_result::failure_at (stmtinfo_a->stmt,
526 "not vectorized, dependence "
527 "between data-refs %T and %T\n",
528 DR_REF (dra), DR_REF (drb));
530 /* Record a negative dependence distance to later limit the
531 amount of stmt copying / unrolling we can perform.
532 Only need to handle read-after-write dependence. */
533 if (DR_IS_READ (drb)
534 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
535 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
536 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
537 continue;
540 unsigned int abs_dist = abs (dist);
541 if (abs_dist >= 2 && abs_dist < *max_vf)
543 /* The dependence distance requires reduction of the maximal
544 vectorization factor. */
545 *max_vf = abs_dist;
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location,
548 "adjusting maximal vectorization factor to %i\n",
549 *max_vf);
552 if (abs_dist >= *max_vf)
554 /* Dependence distance does not create dependence, as far as
555 vectorization is concerned, in this case. */
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "dependence distance >= VF.\n");
559 continue;
562 return opt_result::failure_at (stmtinfo_a->stmt,
563 "not vectorized, possible dependence "
564 "between data-refs %T and %T\n",
565 DR_REF (dra), DR_REF (drb));
568 return opt_result::success ();
571 /* Function vect_analyze_data_ref_dependences.
573 Examine all the data references in the loop, and make sure there do not
574 exist any data dependences between them. Set *MAX_VF according to
575 the maximum vectorization factor the data dependences allow. */
577 opt_result
578 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
579 unsigned int *max_vf)
581 unsigned int i;
582 struct data_dependence_relation *ddr;
584 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
586 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
588 LOOP_VINFO_DDRS (loop_vinfo)
589 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
590 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
591 /* We do not need read-read dependences. */
592 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
593 &LOOP_VINFO_DDRS (loop_vinfo),
594 LOOP_VINFO_LOOP_NEST (loop_vinfo),
595 false);
596 gcc_assert (res);
599 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
601 /* For epilogues we either have no aliases or alias versioning
602 was applied to original loop. Therefore we may just get max_vf
603 using VF of original loop. */
604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
605 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
606 else
607 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
609 opt_result res
610 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
611 if (!res)
612 return res;
615 return opt_result::success ();
619 /* Function vect_slp_analyze_data_ref_dependence.
 621 Return TRUE if there (might) exist a dependence between a memory-reference
 622 DRA and a memory-reference DRB for VINFO that prevents the accesses from
 623 being reordered; return FALSE if the references are independent or the
 624 dependence is harmless (e.g. read-read or within one interleaving chain). */
626 static bool
627 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
628 struct data_dependence_relation *ddr)
630 struct data_reference *dra = DDR_A (ddr);
631 struct data_reference *drb = DDR_B (ddr);
632 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
633 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
635 /* We need to check dependences of statements marked as unvectorizable
636 as well, they still can prohibit vectorization. */
638 /* Independent data accesses. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
640 return false;
642 if (dra == drb)
643 return false;
645 /* Read-read is OK. */
646 if (DR_IS_READ (dra) && DR_IS_READ (drb))
647 return false;
649 /* If dra and drb are part of the same interleaving chain consider
650 them independent. */
651 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
652 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
653 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
654 return false;
656 /* Unknown data dependence. */
657 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
661 "can't determine dependence between %T and %T\n",
662 DR_REF (dra), DR_REF (drb));
664 else if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "determined dependence between %T and %T\n",
667 DR_REF (dra), DR_REF (drb));
669 return true;
673 /* Analyze dependences involved in the transform of SLP NODE. STORES
674 contain the vector of scalar stores of this instance if we are
675 disambiguating the loads. */
677 static bool
678 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
679 vec<stmt_vec_info> stores,
680 stmt_vec_info last_store_info)
682 /* This walks over all stmts involved in the SLP load/store done
683 in NODE verifying we can sink them up to the last stmt in the
684 group. */
685 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
687 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
688 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
690 stmt_vec_info access_info
691 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
692 if (access_info == last_access_info)
693 continue;
694 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
695 ao_ref ref;
696 bool ref_initialized_p = false;
697 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
698 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
700 gimple *stmt = gsi_stmt (gsi);
701 if (! gimple_vuse (stmt))
702 continue;
704 /* If we couldn't record a (single) data reference for this
705 stmt we have to resort to the alias oracle. */
706 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
707 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
708 if (!dr_b)
710 /* We are moving a store - this means
711 we cannot use TBAA for disambiguation. */
712 if (!ref_initialized_p)
713 ao_ref_init (&ref, DR_REF (dr_a));
714 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
715 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
716 return false;
717 continue;
720 bool dependent = false;
721 /* If we run into a store of this same instance (we've just
722 marked those) then delay dependence checking until we run
723 into the last store because this is where it will have
724 been sunk to (and we verify if we can do that as well). */
725 if (gimple_visited_p (stmt))
727 if (stmt_info != last_store_info)
728 continue;
730 for (stmt_vec_info &store_info : stores)
732 data_reference *store_dr
733 = STMT_VINFO_DATA_REF (store_info);
734 ddr_p ddr = initialize_data_dependence_relation
735 (dr_a, store_dr, vNULL);
736 dependent
737 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
738 free_dependence_relation (ddr);
739 if (dependent)
740 break;
743 else
745 ddr_p ddr = initialize_data_dependence_relation (dr_a,
746 dr_b, vNULL);
747 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
748 free_dependence_relation (ddr);
750 if (dependent)
751 return false;
755 else /* DR_IS_READ */
757 stmt_vec_info first_access_info
758 = vect_find_first_scalar_stmt_in_slp (node);
759 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
761 stmt_vec_info access_info
762 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
763 if (access_info == first_access_info)
764 continue;
765 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
766 ao_ref ref;
767 bool ref_initialized_p = false;
768 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
769 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
771 gimple *stmt = gsi_stmt (gsi);
772 if (! gimple_vdef (stmt))
773 continue;
775 /* If we couldn't record a (single) data reference for this
776 stmt we have to resort to the alias oracle. */
777 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
778 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
780 /* We are hoisting a load - this means we can use
781 TBAA for disambiguation. */
782 if (!ref_initialized_p)
783 ao_ref_init (&ref, DR_REF (dr_a));
784 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
786 if (!dr_b)
787 return false;
788 /* Resort to dependence checking below. */
790 else
791 /* No dependence. */
792 continue;
794 bool dependent = false;
795 /* If we run into a store of this same instance (we've just
796 marked those) then delay dependence checking until we run
797 into the last store because this is where it will have
798 been sunk to (and we verify if we can do that as well). */
799 if (gimple_visited_p (stmt))
801 if (stmt_info != last_store_info)
802 continue;
804 for (stmt_vec_info &store_info : stores)
806 data_reference *store_dr
807 = STMT_VINFO_DATA_REF (store_info);
808 ddr_p ddr = initialize_data_dependence_relation
809 (dr_a, store_dr, vNULL);
810 dependent
811 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
812 free_dependence_relation (ddr);
813 if (dependent)
814 break;
817 else
819 ddr_p ddr = initialize_data_dependence_relation (dr_a,
820 dr_b, vNULL);
821 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
822 free_dependence_relation (ddr);
824 if (dependent)
825 return false;
829 return true;
 833 /* Function vect_slp_analyze_instance_dependence.
 835 Examine the data references in SLP instance INSTANCE of VINFO and verify
 836 that no data dependence prevents sinking its stores or hoisting its loads
 837 to the vectorized statement insert locations. Return TRUE if so. */
839 bool
840 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
842 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
844 /* The stores of this instance are at the root of the SLP tree. */
845 slp_tree store = NULL;
846 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
847 store = SLP_INSTANCE_TREE (instance);
849 /* Verify we can sink stores to the vectorized stmt insert location. */
850 stmt_vec_info last_store_info = NULL;
851 if (store)
853 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
854 return false;
856 /* Mark stores in this instance and remember the last one. */
857 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
858 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
859 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
862 bool res = true;
864 /* Verify we can sink loads to the vectorized stmt insert location,
865 special-casing stores of this instance. */
866 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
867 if (! vect_slp_analyze_node_dependences (vinfo, load,
868 store
869 ? SLP_TREE_SCALAR_STMTS (store)
870 : vNULL, last_store_info))
872 res = false;
873 break;
876 /* Unset the visited flag. */
877 if (store)
878 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
879 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
881 return res;
884 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
885 applied. */
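/* Worked example (values assumed for illustration): with a 16-byte target
   alignment, a group whose first element has misalignment 4 and DR_INIT 0,
   and this DR having DR_INIT 8 and OFFSET 0, the result is
   (4 + 8 + 0) % 16 == 12.  */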
888 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
890 HOST_WIDE_INT diff = 0;
891 /* Alignment is only analyzed for the first element of a DR group,
892 use that but adjust misalignment by the offset of the access. */
893 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
895 dr_vec_info *first_dr
896 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
897 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
898 INTEGER_CSTs and the first element in the group has the lowest
899 address. */
900 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
901 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
902 gcc_assert (diff >= 0);
903 dr_info = first_dr;
906 int misalign = dr_info->misalignment;
907 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
908 if (misalign == DR_MISALIGNMENT_UNKNOWN)
909 return misalign;
911 /* If the access is only aligned for a vector type with smaller alignment
912 requirement the access has unknown misalignment. */
913 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
914 targetm.vectorize.preferred_vector_alignment (vectype)))
915 return DR_MISALIGNMENT_UNKNOWN;
917 /* Apply the offset from the DR group start and the externally supplied
918 offset which can for example result from a negative stride access. */
919 poly_int64 misalignment = misalign + diff + offset;
921 /* vect_compute_data_ref_alignment will have ensured that target_alignment
922 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
923 unsigned HOST_WIDE_INT target_alignment_c
924 = dr_info->target_alignment.to_constant ();
925 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
926 return DR_MISALIGNMENT_UNKNOWN;
927 return misalign;
930 /* Record the base alignment guarantee given by DRB, which occurs
931 in STMT_INFO. */
933 static void
934 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
935 innermost_loop_behavior *drb)
937 bool existed;
938 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
939 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
940 if (!existed || entry.second->base_alignment < drb->base_alignment)
942 entry = std::make_pair (stmt_info, drb);
943 if (dump_enabled_p ())
944 dump_printf_loc (MSG_NOTE, vect_location,
945 "recording new base alignment for %T\n"
946 " alignment: %d\n"
947 " misalignment: %d\n"
948 " based on: %G",
949 drb->base_address,
950 drb->base_alignment,
951 drb->base_misalignment,
952 stmt_info->stmt);
956 /* If the region we're going to vectorize is reached, all unconditional
957 data references occur at least once. We can therefore pool the base
958 alignment guarantees from each unconditional reference. Do this by
959 going through all the data references in VINFO and checking whether
960 the containing statement makes the reference unconditionally. If so,
961 record the alignment of the base address in VINFO so that it can be
962 used for all other references with the same base. */
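/* For example (a hedged illustration): if an unconditional store through
   base B guarantees a 32-byte base alignment, that guarantee is recorded
   once here and later reused when computing the misalignment of any other
   reference with the same base address, even a conditional one.  */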
964 void
965 vect_record_base_alignments (vec_info *vinfo)
967 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
968 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
969 for (data_reference *dr : vinfo->shared->datarefs)
971 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
972 stmt_vec_info stmt_info = dr_info->stmt;
973 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
974 && STMT_VINFO_VECTORIZABLE (stmt_info)
975 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
977 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
979 /* If DR is nested in the loop that is being vectorized, we can also
980 record the alignment of the base wrt the outer loop. */
981 if (loop && nested_in_vect_loop_p (loop, stmt_info))
982 vect_record_base_alignment
983 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
988 /* Function vect_compute_data_ref_alignment
990 Compute the misalignment of the data reference DR_INFO when vectorizing
991 with VECTYPE.
993 Output:
994 1. initialized misalignment info for DR_INFO
996 FOR NOW: No analysis is actually performed. Misalignment is calculated
997 only for trivial cases. TODO. */
999 static void
1000 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1001 tree vectype)
1003 stmt_vec_info stmt_info = dr_info->stmt;
1004 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1005 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1006 class loop *loop = NULL;
1007 tree ref = DR_REF (dr_info->dr);
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_NOTE, vect_location,
1011 "vect_compute_data_ref_alignment:\n");
1013 if (loop_vinfo)
1014 loop = LOOP_VINFO_LOOP (loop_vinfo);
1016 /* Initialize misalignment to unknown. */
1017 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1019 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1020 return;
1022 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1023 bool step_preserves_misalignment_p;
1025 poly_uint64 vector_alignment
1026 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1027 BITS_PER_UNIT);
1028 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1030 /* If the main loop has peeled for alignment we have no way of knowing
1031 whether the data accesses in the epilogues are aligned. We can't at
1032 compile time answer the question whether we have entered the main loop or
1033 not. Fixes PR 92351. */
1034 if (loop_vinfo)
1036 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1037 if (orig_loop_vinfo
1038 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1039 return;
1042 unsigned HOST_WIDE_INT vect_align_c;
1043 if (!vector_alignment.is_constant (&vect_align_c))
1044 return;
1046 /* No step for BB vectorization. */
1047 if (!loop)
1049 gcc_assert (integer_zerop (drb->step));
1050 step_preserves_misalignment_p = true;
1053 /* In case the dataref is in an inner-loop of the loop that is being
1054 vectorized (LOOP), we use the base and misalignment information
1055 relative to the outer-loop (LOOP). This is ok only if the misalignment
1056 stays the same throughout the execution of the inner-loop, which is why
 1057 we have to check that the stride of the dataref in the inner-loop is a
 1058 multiple of the vector alignment. */
1059 else if (nested_in_vect_loop_p (loop, stmt_info))
1061 step_preserves_misalignment_p
1062 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1064 if (dump_enabled_p ())
1066 if (step_preserves_misalignment_p)
1067 dump_printf_loc (MSG_NOTE, vect_location,
1068 "inner step divides the vector alignment.\n");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "inner step doesn't divide the vector"
1072 " alignment.\n");
1076 /* Similarly we can only use base and misalignment information relative to
1077 an innermost loop if the misalignment stays the same throughout the
 1078 execution of the loop. As above, this is the case if the stride of
 1079 the dataref times the vectorization factor is a multiple of the alignment. */
1080 else
1082 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1083 step_preserves_misalignment_p
1084 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1086 if (!step_preserves_misalignment_p && dump_enabled_p ())
1087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1088 "step doesn't divide the vector alignment.\n");
1091 unsigned int base_alignment = drb->base_alignment;
1092 unsigned int base_misalignment = drb->base_misalignment;
1094 /* Calculate the maximum of the pooled base address alignment and the
1095 alignment that we can compute for DR itself. */
1096 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1097 = base_alignments->get (drb->base_address);
1098 if (entry
1099 && base_alignment < (*entry).second->base_alignment
1100 && (loop_vinfo
1101 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1102 gimple_bb (entry->first->stmt))
1103 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1104 || (entry->first->dr_aux.group <= dr_info->group)))))
1106 base_alignment = entry->second->base_alignment;
1107 base_misalignment = entry->second->base_misalignment;
1110 if (drb->offset_alignment < vect_align_c
1111 || !step_preserves_misalignment_p
1112 /* We need to know whether the step wrt the vectorized loop is
1113 negative when computing the starting misalignment below. */
1114 || TREE_CODE (drb->step) != INTEGER_CST)
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Unknown alignment for access: %T\n", ref);
1119 return;
1122 if (base_alignment < vect_align_c)
1124 unsigned int max_alignment;
1125 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1126 if (max_alignment < vect_align_c
1127 || !vect_can_force_dr_alignment_p (base,
1128 vect_align_c * BITS_PER_UNIT))
1130 if (dump_enabled_p ())
1131 dump_printf_loc (MSG_NOTE, vect_location,
1132 "can't force alignment of ref: %T\n", ref);
1133 return;
1136 /* Force the alignment of the decl.
1137 NOTE: This is the only change to the code we make during
1138 the analysis phase, before deciding to vectorize the loop. */
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_NOTE, vect_location,
1141 "force alignment of %T\n", ref);
1143 dr_info->base_decl = base;
1144 dr_info->base_misaligned = true;
1145 base_misalignment = 0;
1147 poly_int64 misalignment
1148 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1150 unsigned int const_misalignment;
1151 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "Non-constant misalignment for access: %T\n", ref);
1156 return;
1159 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163 "misalign = %d bytes of ref %T\n",
1164 const_misalignment, ref);
1166 return;
1169 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1170 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1171 is made aligned via peeling. */
1173 static bool
1174 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1175 dr_vec_info *dr_peel_info)
1177 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1178 DR_TARGET_ALIGNMENT (dr_info)))
1180 poly_offset_int diff
1181 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1182 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1183 if (known_eq (diff, 0)
1184 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1185 return true;
1187 return false;
1190 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1191 aligned via peeling. */
1193 static bool
1194 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1195 dr_vec_info *dr_peel_info)
1197 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1198 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1199 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1200 DR_OFFSET (dr_peel_info->dr), 0)
1201 || !operand_equal_p (DR_STEP (dr_info->dr),
1202 DR_STEP (dr_peel_info->dr), 0))
1203 return false;
1205 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 /* Compute the value for dr_info->misalign so that the access appears
1209 aligned. This is used by peeling to compensate for dr_misalignment
1210 applying the offset for negative step. */
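/* Worked example (assumed numbers): for a negative-step access with a V4SI
   vector type, 4-byte elements and a 16-byte target alignment the value is
   (4 - 1) * 4 == 12; dr_misalignment later applies the matching -12 offset
   for the negative step, so the access reads back as aligned (0).  */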
1213 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1215 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1216 return 0;
1218 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1219 poly_int64 misalignment
1220 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1221 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1223 unsigned HOST_WIDE_INT target_alignment_c;
1224 int misalign;
1225 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1226 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1227 return DR_MISALIGNMENT_UNKNOWN;
1228 return misalign;
1231 /* Function vect_update_misalignment_for_peel.
1232 Sets DR_INFO's misalignment
1233 - to 0 if it has the same alignment as DR_PEEL_INFO,
 1234 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1235 - to -1 (unknown) otherwise.
1237 DR_INFO - the data reference whose misalignment is to be adjusted.
1238 DR_PEEL_INFO - the data reference whose misalignment is being made
1239 zero in the vector loop by the peel.
1240 NPEEL - the number of iterations in the peel loop if the misalignment
1241 of DR_PEEL_INFO is known at compile time. */
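/* Worked example for the known-alignment case below (numbers assumed):
   with DR_STEP 4, a current misalignment of 4 and a 16-byte target
   alignment, peeling NPEEL == 3 iterations gives (4 + 3 * 4) & 15 == 0,
   i.e. the access becomes aligned.  */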
1243 static void
1244 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1245 dr_vec_info *dr_peel_info, int npeel)
 1247 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1248 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1250 SET_DR_MISALIGNMENT (dr_info,
1251 vect_dr_misalign_for_aligned_access (dr_peel_info));
1252 return;
1255 unsigned HOST_WIDE_INT alignment;
1256 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1257 && known_alignment_for_access_p (dr_info,
1258 STMT_VINFO_VECTYPE (dr_info->stmt))
1259 && known_alignment_for_access_p (dr_peel_info,
1260 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1262 int misal = dr_info->misalignment;
1263 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1264 misal &= alignment - 1;
1265 set_dr_misalignment (dr_info, misal);
1266 return;
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1271 "to unknown (-1).\n");
1272 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1275 /* Return true if alignment is relevant for DR_INFO. */
1277 static bool
1278 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1280 stmt_vec_info stmt_info = dr_info->stmt;
1282 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1283 return false;
1285 /* For interleaving, only the alignment of the first access matters. */
1286 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1287 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1288 return false;
1290 /* Scatter-gather and invariant accesses continue to address individual
1291 scalars, so vector-level alignment is irrelevant. */
1292 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1293 || integer_zerop (DR_STEP (dr_info->dr)))
1294 return false;
1296 /* Strided accesses perform only component accesses, alignment is
1297 irrelevant for them. */
1298 if (STMT_VINFO_STRIDED_P (stmt_info)
1299 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1300 return false;
1302 return true;
 1305 /* Given a memory reference EXP return whether its alignment is less
1306 than its size. */
1308 static bool
1309 not_size_aligned (tree exp)
1311 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1312 return true;
1314 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1315 > get_object_alignment (exp));
1318 /* Function vector_alignment_reachable_p
1320 Return true if vector alignment for DR_INFO is reachable by peeling
1321 a few loop iterations. Return false otherwise. */
1323 static bool
1324 vector_alignment_reachable_p (dr_vec_info *dr_info)
1326 stmt_vec_info stmt_info = dr_info->stmt;
1327 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1329 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1331 /* For interleaved accesses we peel only if the number of iterations in
 1332 the prolog loop ({VF - misalignment}) is a multiple of the number of
 1333 interleaved accesses. */
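/* For example (assumed numbers): with 8 elements per vector, a
   misalignment of 2 elements and a group size of 3, 8 - 2 == 6 is a
   multiple of 3, so the check below passes.  */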
1334 int elem_size, mis_in_elements;
1336 /* FORNOW: handle only known alignment. */
1337 if (!known_alignment_for_access_p (dr_info, vectype))
1338 return false;
1340 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1341 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1342 elem_size = vector_element_size (vector_size, nelements);
1343 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1345 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1346 return false;
 1349 /* If misalignment is known at compile time then allow peeling
1350 only if natural alignment is reachable through peeling. */
1351 if (known_alignment_for_access_p (dr_info, vectype)
1352 && !aligned_access_p (dr_info, vectype))
1354 HOST_WIDE_INT elmsize =
1355 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1356 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_NOTE, vect_location,
1359 "data size = %wd. misalignment = %d.\n", elmsize,
1360 dr_misalignment (dr_info, vectype));
1362 if (dr_misalignment (dr_info, vectype) % elmsize)
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 "data size does not divide the misalignment.\n");
1367 return false;
1371 if (!known_alignment_for_access_p (dr_info, vectype))
1373 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1374 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1375 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "Unknown misalignment, %snaturally aligned\n",
1378 is_packed ? "not " : "");
1379 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382 return true;
1386 /* Calculate the cost of the memory access represented by DR_INFO. */
1388 static void
1389 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1390 dr_alignment_support alignment_support_scheme,
1391 int misalignment,
1392 unsigned int *inside_cost,
1393 unsigned int *outside_cost,
1394 stmt_vector_for_cost *body_cost_vec,
1395 stmt_vector_for_cost *prologue_cost_vec)
1397 stmt_vec_info stmt_info = dr_info->stmt;
1398 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1399 int ncopies;
1401 if (PURE_SLP_STMT (stmt_info))
1402 ncopies = 1;
1403 else
1404 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1406 if (DR_IS_READ (dr_info->dr))
1407 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1408 misalignment, true, inside_cost,
1409 outside_cost, prologue_cost_vec, body_cost_vec, false);
1410 else
 1411 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1412 misalignment, inside_cost, body_cost_vec);
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "vect_get_data_access_cost: inside_cost = %d, "
1417 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1421 typedef struct _vect_peel_info
1423 dr_vec_info *dr_info;
1424 int npeel;
1425 unsigned int count;
1426 } *vect_peel_info;
1428 typedef struct _vect_peel_extended_info
1430 vec_info *vinfo;
1431 struct _vect_peel_info peel_info;
1432 unsigned int inside_cost;
1433 unsigned int outside_cost;
1434 } *vect_peel_extended_info;
1437 /* Peeling hashtable helpers. */
1439 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1441 static inline hashval_t hash (const _vect_peel_info *);
1442 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 inline hashval_t
1446 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1448 return (hashval_t) peel_info->npeel;
1451 inline bool
1452 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1454 return (a->npeel == b->npeel);
1458 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
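/* A small hedged example: if two data refs both become aligned after
   peeling 2 iterations, both insertions land in the slot keyed by
   npeel == 2 and its count reaches 2; with the cost model disabled a DR
   that would be unsupported when left misaligned additionally biases its
   slot by VECT_MAX_COST.  */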
1460 static void
1461 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1462 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1463 int npeel, bool supportable_if_not_aligned)
1465 struct _vect_peel_info elem, *slot;
1466 _vect_peel_info **new_slot;
1468 elem.npeel = npeel;
1469 slot = peeling_htab->find (&elem);
1470 if (slot)
1471 slot->count++;
1472 else
1474 slot = XNEW (struct _vect_peel_info);
1475 slot->npeel = npeel;
1476 slot->dr_info = dr_info;
1477 slot->count = 1;
1478 new_slot = peeling_htab->find_slot (slot, INSERT);
1479 *new_slot = slot;
1482 /* If this DR is not supported with unknown misalignment then bias
1483 this slot when the cost model is disabled. */
1484 if (!supportable_if_not_aligned
1485 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1486 slot->count += VECT_MAX_COST;
 1490 /* Traverse the peeling hash table to find the peeling option that aligns
 1491 the maximum number of data accesses. */
1494 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1495 _vect_peel_extended_info *max)
1497 vect_peel_info elem = *slot;
1499 if (elem->count > max->peel_info.count
1500 || (elem->count == max->peel_info.count
1501 && max->peel_info.npeel > elem->npeel))
1503 max->peel_info.npeel = elem->npeel;
1504 max->peel_info.count = elem->count;
1505 max->peel_info.dr_info = elem->dr_info;
1508 return 1;
 1511 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
 1512 data access costs for all data refs. If NPEEL is zero the current
 1513 misalignments are used; otherwise DR0_INFO (and any data ref that is
 1514 aligned whenever DR0_INFO is) is treated as aligned after the peel. */
1516 static void
1517 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1518 dr_vec_info *dr0_info,
1519 unsigned int *inside_cost,
1520 unsigned int *outside_cost,
1521 stmt_vector_for_cost *body_cost_vec,
1522 stmt_vector_for_cost *prologue_cost_vec,
1523 unsigned int npeel)
1525 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1527 bool dr0_alignment_known_p
1528 = (dr0_info
1529 && known_alignment_for_access_p (dr0_info,
1530 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1532 for (data_reference *dr : datarefs)
1534 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1535 if (!vect_relevant_for_alignment_p (dr_info))
1536 continue;
1538 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1539 dr_alignment_support alignment_support_scheme;
1540 int misalignment;
1541 unsigned HOST_WIDE_INT alignment;
1543 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1544 size_zero_node) < 0;
1545 poly_int64 off = 0;
1546 if (negative)
1547 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1548 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1550 if (npeel == 0)
1551 misalignment = dr_misalignment (dr_info, vectype, off);
1552 else if (dr_info == dr0_info
1553 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1554 misalignment = 0;
1555 else if (!dr0_alignment_known_p
1556 || !known_alignment_for_access_p (dr_info, vectype)
1557 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1558 misalignment = DR_MISALIGNMENT_UNKNOWN;
1559 else
1561 misalignment = dr_misalignment (dr_info, vectype, off);
1562 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1563 misalignment &= alignment - 1;
1565 alignment_support_scheme
1566 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1567 misalignment);
1569 vect_get_data_access_cost (loop_vinfo, dr_info,
1570 alignment_support_scheme, misalignment,
1571 inside_cost, outside_cost,
1572 body_cost_vec, prologue_cost_vec);
1576 /* Traverse peeling hash table and calculate cost for each peeling option.
1577 Find the one with the lowest cost. */
1580 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1581 _vect_peel_extended_info *min)
1583 vect_peel_info elem = *slot;
1584 int dummy;
1585 unsigned int inside_cost = 0, outside_cost = 0;
1586 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1587 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1588 epilogue_cost_vec;
1590 prologue_cost_vec.create (2);
1591 body_cost_vec.create (2);
1592 epilogue_cost_vec.create (2);
1594 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1595 &outside_cost, &body_cost_vec,
1596 &prologue_cost_vec, elem->npeel);
1598 body_cost_vec.release ();
1600 outside_cost += vect_get_known_peeling_cost
1601 (loop_vinfo, elem->npeel, &dummy,
1602 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1603 &prologue_cost_vec, &epilogue_cost_vec);
1605 /* Prologue and epilogue costs are added to the target model later.
1606 These costs depend only on the scalar iteration cost, the
1607 number of peeling iterations finally chosen, and the number of
1608 misaligned statements. So discard the information found here. */
1609 prologue_cost_vec.release ();
1610 epilogue_cost_vec.release ();
1612 if (inside_cost < min->inside_cost
1613 || (inside_cost == min->inside_cost
1614 && outside_cost < min->outside_cost))
1616 min->inside_cost = inside_cost;
1617 min->outside_cost = outside_cost;
1618 min->peel_info.dr_info = elem->dr_info;
1619 min->peel_info.npeel = elem->npeel;
1620 min->peel_info.count = elem->count;
1623 return 1;
1627 /* Choose best peeling option by traversing peeling hash table and either
1628 choosing an option with the lowest cost (if cost model is enabled) or the
1629 option that aligns as many accesses as possible. */
1631 static struct _vect_peel_extended_info
1632 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1633 loop_vec_info loop_vinfo)
1635 struct _vect_peel_extended_info res;
1637 res.peel_info.dr_info = NULL;
1638 res.vinfo = loop_vinfo;
1640 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1642 res.inside_cost = INT_MAX;
1643 res.outside_cost = INT_MAX;
1644 peeling_htab->traverse <_vect_peel_extended_info *,
1645 vect_peeling_hash_get_lowest_cost> (&res);
1647 else
1649 res.peel_info.count = 0;
1650 peeling_htab->traverse <_vect_peel_extended_info *,
1651 vect_peeling_hash_get_most_frequent> (&res);
1652 res.inside_cost = 0;
1653 res.outside_cost = 0;
1656 return res;
1659 /* Return true if the new peeling NPEEL is supported. */
1661 static bool
1662 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1663 unsigned npeel)
1665 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1666 enum dr_alignment_support supportable_dr_alignment;
1668 bool dr0_alignment_known_p
1669 = known_alignment_for_access_p (dr0_info,
1670 STMT_VINFO_VECTYPE (dr0_info->stmt));
1672 /* Ensure that all data refs can be vectorized after the peel. */
1673 for (data_reference *dr : datarefs)
1675 if (dr == dr0_info->dr)
1676 continue;
1678 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1679 if (!vect_relevant_for_alignment_p (dr_info)
1680 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1681 continue;
1683 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1684 int misalignment;
1685 unsigned HOST_WIDE_INT alignment;
1686 if (!dr0_alignment_known_p
1687 || !known_alignment_for_access_p (dr_info, vectype)
1688 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1689 misalignment = DR_MISALIGNMENT_UNKNOWN;
1690 else
1692 misalignment = dr_misalignment (dr_info, vectype);
1693 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1694 misalignment &= alignment - 1;
1696 supportable_dr_alignment
1697 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1698 misalignment);
1699 if (supportable_dr_alignment == dr_unaligned_unsupported)
1700 return false;
1703 return true;
1706 /* Compare two data-references DRA and DRB to group them into chunks
1707 with related alignment. */
1709 static int
1710 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1712 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1713 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1714 int cmp;
1716 /* Stabilize sort. */
1717 if (dra == drb)
1718 return 0;
1720 /* Ordering of DRs according to base. */
1721 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1722 DR_BASE_ADDRESS (drb));
1723 if (cmp != 0)
1724 return cmp;
1726 /* And according to DR_OFFSET. */
1727 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1728 if (cmp != 0)
1729 return cmp;
 1731 /* And according to DR_STEP. */
1732 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1733 if (cmp != 0)
1734 return cmp;
 1736 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
1737 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1738 if (cmp == 0)
1739 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1740 return cmp;
1743 /* Function vect_enhance_data_refs_alignment
1745 This pass will use loop versioning and loop peeling in order to enhance
1746 the alignment of data references in the loop.
1748 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1749 original loop is to be vectorized. Any other loops that are created by
1750 the transformations performed in this pass are not supposed to be
1751 vectorized. This restriction will be relaxed.
1753 This pass will require a cost model to guide it whether to apply peeling
1754 or versioning or a combination of the two. For example, the scheme that
1755 Intel uses when given a loop with several memory accesses, is as follows:
1756 choose one memory access ('p') whose alignment you want to force by doing
1757 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1758 other accesses are not necessarily aligned, or (2) use loop versioning to
1759 generate one loop in which all accesses are aligned, and another loop in
1760 which only 'p' is necessarily aligned.
1762 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1763 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1764 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1766 Devising a cost model is the most critical aspect of this work. It will
1767 guide us on which access to peel for, whether to use loop versioning, how
1768 many versions to create, etc. The cost model will probably consist of
1769 generic considerations as well as target specific considerations (on
1770 powerpc for example, misaligned stores are more painful than misaligned
1771 loads).
1773 Here are the general steps involved in alignment enhancements:
1775 -- original loop, before alignment analysis:
1776 for (i=0; i<N; i++){
1777 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1778 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1781 -- After vect_compute_data_refs_alignment:
1782 for (i=0; i<N; i++){
1783 x = q[i]; # DR_MISALIGNMENT(q) = 3
1784 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1787 -- Possibility 1: we do loop versioning:
1788 if (p is aligned) {
1789 for (i=0; i<N; i++){ # loop 1A
1790 x = q[i]; # DR_MISALIGNMENT(q) = 3
1791 p[i] = y; # DR_MISALIGNMENT(p) = 0
1794 else {
1795 for (i=0; i<N; i++){ # loop 1B
1796 x = q[i]; # DR_MISALIGNMENT(q) = 3
1797 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1801 -- Possibility 2: we do loop peeling:
1802 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1803 x = q[i];
1804 p[i] = y;
1806 for (i = 3; i < N; i++){ # loop 2A
1807 x = q[i]; # DR_MISALIGNMENT(q) = 0
1808 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1811 -- Possibility 3: combination of loop peeling and versioning:
1812 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1813 x = q[i];
1814 p[i] = y;
1816 if (p is aligned) {
1817 for (i = 3; i<N; i++){ # loop 3A
1818 x = q[i]; # DR_MISALIGNMENT(q) = 0
1819 p[i] = y; # DR_MISALIGNMENT(p) = 0
1822 else {
1823 for (i = 3; i<N; i++){ # loop 3B
1824 x = q[i]; # DR_MISALIGNMENT(q) = 0
1825 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1829 These loops are later passed to loop_transform to be vectorized. The
1830 vectorizer will use the alignment information to guide the transformation
1831 (whether to generate regular loads/stores, or with special handling for
1832 misalignment). */
1834 opt_result
1835 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838 dr_vec_info *first_store = NULL;
1839 dr_vec_info *dr0_info = NULL;
1840 struct data_reference *dr;
1841 unsigned int i;
1842 bool do_peeling = false;
1843 bool do_versioning = false;
1844 unsigned int npeel = 0;
1845 bool one_misalignment_known = false;
1846 bool one_misalignment_unknown = false;
1847 bool one_dr_unsupportable = false;
1848 dr_vec_info *unsupportable_dr_info = NULL;
1849 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1850 hash_table<peel_info_hasher> peeling_htab (1);
1852 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1854 /* Reset data so we can safely be called multiple times. */
1855 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1856 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1858 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1859 return opt_result::success ();
1861 /* Sort the vector of datarefs so DRs that have the same or dependent
1862 alignment are next to each other. */
1863 auto_vec<data_reference_p> datarefs
1864 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1865 datarefs.qsort (dr_align_group_sort_cmp);
1867 /* Compute the number of DRs that become aligned when we peel
1868 a dataref so it becomes aligned. */
1869 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1870 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1871 unsigned i0;
1872 for (i0 = 0; i0 < datarefs.length (); ++i0)
1873 if (DR_BASE_ADDRESS (datarefs[i0]))
1874 break;
1875 for (i = i0 + 1; i <= datarefs.length (); ++i)
1877 if (i == datarefs.length ()
1878 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1879 DR_BASE_ADDRESS (datarefs[i]), 0)
1880 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1881 DR_OFFSET (datarefs[i]), 0)
1882 || !operand_equal_p (DR_STEP (datarefs[i0]),
1883 DR_STEP (datarefs[i]), 0))
1885 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1886 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1887 will get known misalignment if we align one of the refs
1888 with the largest DR_TARGET_ALIGNMENT. */
1889 for (unsigned j = i0; j < i; ++j)
1891 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1892 for (unsigned k = i0; k < i; ++k)
1894 if (k == j)
1895 continue;
1896 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1897 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1898 dr_infoj))
1899 n_same_align_refs[j]++;
1902 i0 = i;
1906 /* While cost model enhancements are expected in the future, the high level
1907 view of the code at this time is as follows:
1909 A) If there is a misaligned access then see if peeling to align
1910 this access can make all data references satisfy
1911 vect_supportable_dr_alignment. If so, update data structures
1912 as needed and return true.
1914 B) If peeling wasn't possible and there is a data reference with an
1915 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1916 then see if loop versioning checks can be used to make all data
1917 references satisfy vect_supportable_dr_alignment. If so, update
1918 data structures as needed and return true.
1920 C) If neither peeling nor versioning were successful then return false if
1921 any data reference does not satisfy vect_supportable_dr_alignment.
1923 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1925 Note, Possibility 3 above (which is peeling and versioning together) is not
1926 being done at this time. */
1928 /* (1) Peeling to force alignment. */
1930 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1931 Considerations:
1932 + How many accesses will become aligned due to the peeling
1933 - How many accesses will become unaligned due to the peeling,
1934 and the cost of misaligned accesses.
1935 - The cost of peeling (the extra runtime checks, the increase
1936 in code size). */
1938 FOR_EACH_VEC_ELT (datarefs, i, dr)
1940 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1941 if (!vect_relevant_for_alignment_p (dr_info))
1942 continue;
1944 stmt_vec_info stmt_info = dr_info->stmt;
1945 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1946 do_peeling = vector_alignment_reachable_p (dr_info);
1947 if (do_peeling)
1949 if (known_alignment_for_access_p (dr_info, vectype))
1951 unsigned int npeel_tmp = 0;
1952 bool negative = tree_int_cst_compare (DR_STEP (dr),
1953 size_zero_node) < 0;
1955 /* If known_alignment_for_access_p then we have set
1956 DR_MISALIGNMENT, which is only done if we know it at compile
1957 time, so it is safe to assume the target alignment is constant.  */
1959 unsigned int target_align =
1960 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1961 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1962 poly_int64 off = 0;
1963 if (negative)
1964 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1965 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1966 mis = negative ? mis : -mis;
1967 if (mis != 0)
1968 npeel_tmp = (mis & (target_align - 1)) / dr_size;
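/* Worked example with invented numbers: for a forward access with
   target_align = 16, dr_size = 4 and dr_misalignment = 4, the negated
   misalignment masked by 15 is 12, so npeel_tmp = 12 / 4 = 3; peeling
   three scalar iterations advances the start address by 12 bytes onto a
   16-byte boundary.  */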
1970 /* For multiple types, it is possible that the bigger type access
1971 will have more than one peeling option. E.g., a loop with two
1972 types: one of size (vector size / 4), and the other one of
1973 size (vector size / 8). The vectorization factor will be 8. If both
1974 accesses are misaligned by 3, the first one needs one scalar
1975 iteration to be aligned, and the second one needs 5. But the
1976 first one will be aligned also by peeling 5 scalar
1977 iterations, and in that case both accesses will be aligned.
1978 Hence, except for the immediate peeling amount, we also want
1979 to try to add full vector size, while we don't exceed
1980 vectorization factor.
1981 We do this automatically for the cost model, since we calculate
1982 the cost of every peeling option. */
1983 poly_uint64 nscalars = npeel_tmp;
1984 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1986 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1987 nscalars = (STMT_SLP_TYPE (stmt_info)
1988 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991 /* Save info about DR in the hash table. Also include peeling
1992 amounts according to the explanation above. Indicate
1993 the alignment status when the ref is not aligned.
1994 ??? Rather than using unknown alignment here we should
1995 prune all entries from the peeling hashtable which cause
1996 DRs to be not supported. */
1997 bool supportable_if_not_aligned
1998 = vect_supportable_dr_alignment
1999 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2000 while (known_le (npeel_tmp, nscalars))
2002 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2003 dr_info, npeel_tmp,
2004 supportable_if_not_aligned);
2005 npeel_tmp += MAX (1, target_align / dr_size);
2008 one_misalignment_known = true;
2010 else
2012 /* If we don't know any misalignment values, we prefer
2013 peeling for the data-ref that has the maximum number of data-refs
2014 with the same alignment, unless the target prefers to align
2015 stores over loads. */
2016 unsigned same_align_drs = n_same_align_refs[i];
2017 if (!dr0_info
2018 || dr0_same_align_drs < same_align_drs)
2020 dr0_same_align_drs = same_align_drs;
2021 dr0_info = dr_info;
2023 /* For data-refs with the same number of related
2024 accesses prefer the one where the misalign
2025 computation will be invariant in the outermost loop. */
2026 else if (dr0_same_align_drs == same_align_drs)
2028 class loop *ivloop0, *ivloop;
2029 ivloop0 = outermost_invariant_loop_for_expr
2030 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2031 ivloop = outermost_invariant_loop_for_expr
2032 (loop, DR_BASE_ADDRESS (dr));
2033 if ((ivloop && !ivloop0)
2034 || (ivloop && ivloop0
2035 && flow_loop_nested_p (ivloop, ivloop0)))
2036 dr0_info = dr_info;
2039 one_misalignment_unknown = true;
2041 /* Check for data refs with unsupportable alignment that
2042 can be peeled. */
2043 enum dr_alignment_support supportable_dr_alignment
2044 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2045 DR_MISALIGNMENT_UNKNOWN);
2046 if (supportable_dr_alignment == dr_unaligned_unsupported)
2048 one_dr_unsupportable = true;
2049 unsupportable_dr_info = dr_info;
2052 if (!first_store && DR_IS_WRITE (dr))
2054 first_store = dr_info;
2055 first_store_same_align_drs = same_align_drs;
2059 else
2061 if (!aligned_access_p (dr_info, vectype))
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2065 "vector alignment may not be reachable\n");
2066 break;
2071 /* Check if we can possibly peel the loop. */
2072 if (!vect_can_advance_ivs_p (loop_vinfo)
2073 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2074 || loop->inner)
2075 do_peeling = false;
2077 struct _vect_peel_extended_info peel_for_known_alignment;
2078 struct _vect_peel_extended_info peel_for_unknown_alignment;
2079 struct _vect_peel_extended_info best_peel;
2081 peel_for_unknown_alignment.inside_cost = INT_MAX;
2082 peel_for_unknown_alignment.outside_cost = INT_MAX;
2083 peel_for_unknown_alignment.peel_info.count = 0;
2085 if (do_peeling
2086 && one_misalignment_unknown)
2088 /* Check whether the target prefers aligning stores over loads, i.e.,
2089 whether misaligned stores are more expensive than misaligned loads
2090 (taking DRs with the same alignment into account). */
2091 unsigned int load_inside_cost = 0;
2092 unsigned int load_outside_cost = 0;
2093 unsigned int store_inside_cost = 0;
2094 unsigned int store_outside_cost = 0;
2095 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2097 stmt_vector_for_cost dummy;
2098 dummy.create (2);
2099 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2100 &load_inside_cost,
2101 &load_outside_cost,
2102 &dummy, &dummy, estimated_npeels);
2103 dummy.release ();
2105 if (first_store)
2107 dummy.create (2);
2108 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2109 &store_inside_cost,
2110 &store_outside_cost,
2111 &dummy, &dummy,
2112 estimated_npeels);
2113 dummy.release ();
2115 else
2117 store_inside_cost = INT_MAX;
2118 store_outside_cost = INT_MAX;
2121 if (load_inside_cost > store_inside_cost
2122 || (load_inside_cost == store_inside_cost
2123 && load_outside_cost > store_outside_cost))
2125 dr0_info = first_store;
2126 dr0_same_align_drs = first_store_same_align_drs;
2127 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2128 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2130 else
2132 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2133 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2137 prologue_cost_vec.create (2);
2138 epilogue_cost_vec.create (2);
2140 int dummy2;
2141 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2142 (loop_vinfo, estimated_npeels, &dummy2,
2143 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2144 &prologue_cost_vec, &epilogue_cost_vec);
2146 prologue_cost_vec.release ();
2147 epilogue_cost_vec.release ();
2149 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152 peel_for_unknown_alignment.peel_info.npeel = 0;
2153 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2155 best_peel = peel_for_unknown_alignment;
2157 peel_for_known_alignment.inside_cost = INT_MAX;
2158 peel_for_known_alignment.outside_cost = INT_MAX;
2159 peel_for_known_alignment.peel_info.count = 0;
2160 peel_for_known_alignment.peel_info.dr_info = NULL;
2162 if (do_peeling && one_misalignment_known)
2164 /* Peeling is possible, and no data access requires alignment in order
2165 to be supported. So we try to choose the best possible peeling from
2166 the hash table. */
2167 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2168 (&peeling_htab, loop_vinfo);
2171 /* Compare costs of peeling for known and unknown alignment. */
2172 if (peel_for_known_alignment.peel_info.dr_info != NULL
2173 && peel_for_unknown_alignment.inside_cost
2174 >= peel_for_known_alignment.inside_cost)
2176 best_peel = peel_for_known_alignment;
2178 /* If the best peeling for known alignment has NPEEL == 0, perform no
2179 peeling at all except if there is an unsupportable dr that we can
2180 align. */
2181 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2182 do_peeling = false;
2185 /* If there is an unsupportable data ref, prefer this over all choices so far
2186 since we'd have to discard a chosen peeling except when it accidentally
2187 aligned the unsupportable data ref. */
2188 if (one_dr_unsupportable)
2189 dr0_info = unsupportable_dr_info;
2190 else if (do_peeling)
2192 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2193 TODO: Use nopeel_outside_cost or get rid of it? */
2194 unsigned nopeel_inside_cost = 0;
2195 unsigned nopeel_outside_cost = 0;
2197 stmt_vector_for_cost dummy;
2198 dummy.create (2);
2199 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2200 &nopeel_outside_cost, &dummy, &dummy, 0);
2201 dummy.release ();
2203 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2204 costs will be recorded. */
2205 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2206 prologue_cost_vec.create (2);
2207 epilogue_cost_vec.create (2);
2209 int dummy2;
2210 nopeel_outside_cost += vect_get_known_peeling_cost
2211 (loop_vinfo, 0, &dummy2,
2212 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2213 &prologue_cost_vec, &epilogue_cost_vec);
2215 prologue_cost_vec.release ();
2216 epilogue_cost_vec.release ();
2218 npeel = best_peel.peel_info.npeel;
2219 dr0_info = best_peel.peel_info.dr_info;
2221 /* If not peeling is no more expensive than the best peeling found
2222 so far, don't perform any peeling. */
2223 if (nopeel_inside_cost <= best_peel.inside_cost)
2224 do_peeling = false;
2227 if (do_peeling)
2229 stmt_vec_info stmt_info = dr0_info->stmt;
2230 if (known_alignment_for_access_p (dr0_info,
2231 STMT_VINFO_VECTYPE (stmt_info)))
2233 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2234 size_zero_node) < 0;
2235 if (!npeel)
2237 /* Since it's known at compile time, compute the number of
2238 iterations in the peeled loop (the peeling factor) for use in
2239 updating DR_MISALIGNMENT values. The peeling factor is the
2240 vectorization factor minus the misalignment as an element
2241 count. */
2242 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2243 poly_int64 off = 0;
2244 if (negative)
2245 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2246 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2247 unsigned int mis
2248 = dr_misalignment (dr0_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 /* If known_alignment_for_access_p then we have set
2251 DR_MISALIGNMENT, which is only done if we know it at compile
2252 time, so it is safe to assume the target alignment is constant.  */
2254 unsigned int target_align =
2255 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2256 npeel = ((mis & (target_align - 1))
2257 / vect_get_scalar_dr_size (dr0_info));
2260 /* For interleaved data access every iteration accesses all the
2261 members of the group, therefore we divide the number of iterations
2262 by the group size. */
2263 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2264 npeel /= DR_GROUP_SIZE (stmt_info);
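/* Worked example with invented numbers: if the element-count peel computed
   above is 6 and DR0 leads an interleaved group of size 2, each scalar
   iteration covers both group members, so only 6 / 2 = 3 loop iterations
   need to be peeled.  */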
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Try peeling by %d\n", npeel);
2271 /* Ensure that all datarefs can be vectorized after the peel. */
2272 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2273 do_peeling = false;
2275 /* Check if all datarefs are supportable and log. */
2276 if (do_peeling
2277 && npeel == 0
2278 && known_alignment_for_access_p (dr0_info,
2279 STMT_VINFO_VECTYPE (stmt_info)))
2280 return opt_result::success ();
2282 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2283 if (do_peeling)
2285 unsigned max_allowed_peel
2286 = param_vect_max_peeling_for_alignment;
2287 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2288 max_allowed_peel = 0;
2289 if (max_allowed_peel != (unsigned)-1)
2291 unsigned max_peel = npeel;
2292 if (max_peel == 0)
2294 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2295 unsigned HOST_WIDE_INT target_align_c;
2296 if (target_align.is_constant (&target_align_c))
2297 max_peel =
2298 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2299 else
2301 do_peeling = false;
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "Disable peeling, max peels set and vector"
2305 " alignment unknown\n");
2308 if (max_peel > max_allowed_peel)
2310 do_peeling = false;
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "Disable peeling, max peels reached: %d\n", max_peel);
2318 /* Cost model #2 - if peeling may result in a remaining loop not
2319 iterating enough to be vectorized then do not peel. Since this
2320 is a cost heuristic rather than a correctness decision, use the
2321 most likely runtime value for variable vectorization factors. */
2322 if (do_peeling
2323 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2325 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2326 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2327 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2328 < assumed_vf + max_peel)
2329 do_peeling = false;
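/* Worked example with invented numbers: with an assumed VF of 4 and
   npeel == 0 the worst case is max_peel = 3; a loop known to run only
   6 iterations fails the check because 6 < 4 + 3, i.e. after peeling up
   to 3 iterations fewer than one full vector iteration could remain, so
   peeling is rejected.  */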
2332 if (do_peeling)
2334 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2335 If the misalignment of DR_i is identical to that of dr0 then set
2336 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2337 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2338 by the peeling factor times the element size of DR_i (MOD the
2339 vectorization factor times the size). Otherwise, the
2340 misalignment of DR_i must be set to unknown. */
2341 FOR_EACH_VEC_ELT (datarefs, i, dr)
2342 if (dr != dr0_info->dr)
2344 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2345 if (!vect_relevant_for_alignment_p (dr_info))
2346 continue;
2348 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2352 if (npeel)
2353 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2354 else
2355 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2356 SET_DR_MISALIGNMENT (dr0_info,
2357 vect_dr_misalign_for_aligned_access (dr0_info));
2358 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_NOTE, vect_location,
2361 "Alignment of access forced using peeling.\n");
2362 dump_printf_loc (MSG_NOTE, vect_location,
2363 "Peeling for alignment will be applied.\n");
2366 /* The inside-loop cost will be accounted for in vectorizable_load
2367 and vectorizable_store correctly with adjusted alignments.
2368 Drop the body_cst_vec on the floor here. */
2369 return opt_result::success ();
2373 /* (2) Versioning to force alignment. */
2375 /* Try versioning if:
2376 1) optimize loop for speed and the cost-model is not cheap
2377 2) there is at least one unsupported misaligned data ref with an unknown
2378 misalignment, and
2379 3) all misaligned data refs with a known misalignment are supported, and
2380 4) the number of runtime alignment checks is within reason. */
2382 do_versioning
2383 = (optimize_loop_nest_for_speed_p (loop)
2384 && !loop->inner /* FORNOW */
2385 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2387 if (do_versioning)
2389 FOR_EACH_VEC_ELT (datarefs, i, dr)
2391 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2392 if (!vect_relevant_for_alignment_p (dr_info))
2393 continue;
2395 stmt_vec_info stmt_info = dr_info->stmt;
2396 if (STMT_VINFO_STRIDED_P (stmt_info))
2398 do_versioning = false;
2399 break;
2402 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2403 bool negative = tree_int_cst_compare (DR_STEP (dr),
2404 size_zero_node) < 0;
2405 poly_int64 off = 0;
2406 if (negative)
2407 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2408 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2409 int misalignment;
2410 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2411 continue;
2413 enum dr_alignment_support supportable_dr_alignment
2414 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2415 misalignment);
2416 if (supportable_dr_alignment == dr_unaligned_unsupported)
2418 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2419 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2420 >= (unsigned) param_vect_max_version_for_alignment_checks))
2422 do_versioning = false;
2423 break;
2426 /* At present we don't support versioning for alignment
2427 with variable VF, since there's no guarantee that the
2428 VF is a power of two. We could relax this if we added
2429 a way of enforcing a power-of-two size. */
2430 unsigned HOST_WIDE_INT size;
2431 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2433 do_versioning = false;
2434 break;
2437 /* Forcing alignment in the first iteration is no good if
2438 we don't keep it across iterations. For now, just disable
2439 versioning in this case.
2440 ??? We could actually unroll the loop to achieve the required
2441 overall step alignment, and forcing the alignment could be
2442 done by doing some iterations of the non-vectorized loop. */
2443 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2444 * DR_STEP_ALIGNMENT (dr),
2445 DR_TARGET_ALIGNMENT (dr_info)))
2447 do_versioning = false;
2448 break;
2451 /* The rightmost bits of an aligned address must be zeros.
2452 Construct the mask needed for this test. For example,
2453 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2454 mask must be 15 = 0xf. */
2455 int mask = size - 1;
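/* As a sketch of where MASK ends up (the actual guard is built later when
   the versioning condition is generated, see
   vect_create_cond_for_align_checks): the run-time test conceptually is
     ((addr_1 | addr_2 | ... | addr_n) & mask) == 0
   i.e. a single AND checks that every recorded address has its low
   log2 (size) bits clear, e.g. (addr & 0xf) == 0 for 16-byte vectors.  */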
2457 /* FORNOW: use the same mask to test all potentially unaligned
2458 references in the loop. */
2459 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2460 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2462 do_versioning = false;
2463 break;
2466 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2467 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2471 /* Versioning requires at least one misaligned data reference. */
2472 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2473 do_versioning = false;
2474 else if (!do_versioning)
2475 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478 if (do_versioning)
2480 const vec<stmt_vec_info> &may_misalign_stmts
2481 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2482 stmt_vec_info stmt_info;
2484 /* It can now be assumed that the data references in the statements
2485 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2486 of the loop being vectorized. */
2487 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2489 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2490 SET_DR_MISALIGNMENT (dr_info,
2491 vect_dr_misalign_for_aligned_access (dr_info));
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_NOTE, vect_location,
2494 "Alignment of access forced using versioning.\n");
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "Versioning for alignment will be applied.\n");
2501 /* Peeling and versioning can't be done together at this time. */
2502 gcc_assert (! (do_peeling && do_versioning));
2504 return opt_result::success ();
2507 /* This point is reached if neither peeling nor versioning is being done. */
2508 gcc_assert (! (do_peeling || do_versioning));
2510 return opt_result::success ();
2514 /* Function vect_analyze_data_refs_alignment
2516 Analyze the alignment of the data-references in the loop.
2517 Return FALSE if a data reference is found that cannot be vectorized. */
2519 opt_result
2520 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2522 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2524 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2525 struct data_reference *dr;
2526 unsigned int i;
2528 vect_record_base_alignments (loop_vinfo);
2529 FOR_EACH_VEC_ELT (datarefs, i, dr)
2531 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2532 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2534 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2535 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2536 continue;
2537 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2538 STMT_VINFO_VECTYPE (dr_info->stmt));
2542 return opt_result::success ();
2546 /* Analyze alignment of DRs of stmts in NODE. */
2548 static bool
2549 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2551 /* Alignment is maintained in the first element of the group. */
2552 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2553 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2554 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2555 tree vectype = SLP_TREE_VECTYPE (node);
2556 poly_uint64 vector_alignment
2557 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2558 BITS_PER_UNIT);
2559 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2560 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2561 /* Re-analyze alignment when we're facing a vectorization with a bigger
2562 alignment requirement. */
2563 else if (known_lt (dr_info->target_alignment, vector_alignment))
2565 poly_uint64 old_target_alignment = dr_info->target_alignment;
2566 int old_misalignment = dr_info->misalignment;
2567 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2568 /* But keep knowledge about a smaller alignment. */
2569 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2570 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2572 dr_info->target_alignment = old_target_alignment;
2573 dr_info->misalignment = old_misalignment;
2576 /* If we ever face unordered target alignments, the first one analyzed wins
2577 and the other will be treated as unknown by dr_misalignment. */
2578 return true;
2581 /* Function vect_slp_analyze_instance_alignment
2583 Analyze the alignment of the data-references in the SLP instance.
2584 Return FALSE if a data reference is found that cannot be vectorized. */
2586 bool
2587 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2588 slp_instance instance)
2590 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2592 slp_tree node;
2593 unsigned i;
2594 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2595 if (! vect_slp_analyze_node_alignment (vinfo, node))
2596 return false;
2598 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2599 && ! vect_slp_analyze_node_alignment
2600 (vinfo, SLP_INSTANCE_TREE (instance)))
2601 return false;
2603 return true;
2607 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2608 accesses of legal size, step, etc. Detect gaps, single element
2609 interleaving, and other special cases. Set grouped access info.
2610 Collect groups of strided stores for further use in SLP analysis.
2611 Worker for vect_analyze_group_access. */
2613 static bool
2614 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2616 data_reference *dr = dr_info->dr;
2617 tree step = DR_STEP (dr);
2618 tree scalar_type = TREE_TYPE (DR_REF (dr));
2619 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2620 stmt_vec_info stmt_info = dr_info->stmt;
2621 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2622 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2623 HOST_WIDE_INT dr_step = -1;
2624 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2625 bool slp_impossible = false;
2627 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2628 size of the interleaving group (including gaps). */
2629 if (tree_fits_shwi_p (step))
2631 dr_step = tree_to_shwi (step);
2632 /* Check that STEP is a multiple of type size. Otherwise there is
2633 a non-element-sized gap at the end of the group which we
2634 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2635 ??? As we can handle non-constant step fine here we should
2636 simply remove uses of DR_GROUP_GAP between the last and first
2637 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2638 simply not include that gap. */
2639 if ((dr_step % type_size) != 0)
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_NOTE, vect_location,
2643 "Step %T is not a multiple of the element size"
2644 " for %T\n",
2645 step, DR_REF (dr));
2646 return false;
2648 groupsize = absu_hwi (dr_step) / type_size;
2650 else
2651 groupsize = 0;
2653 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2654 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2656 /* Check whether this DR is part of an interleaving group and is the single
2657 element of the group that is accessed in the loop. */
2659 /* Gaps are supported only for loads. STEP must be a multiple of the type
2660 size. */
2661 if (DR_IS_READ (dr)
2662 && (dr_step % type_size) == 0
2663 && groupsize > 0
2664 /* This could be UINT_MAX but as we are generating code in a very
2665 inefficient way we have to cap earlier.
2666 See PR91403 for example. */
2667 && groupsize <= 4096)
2669 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2670 DR_GROUP_SIZE (stmt_info) = groupsize;
2671 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_NOTE, vect_location,
2674 "Detected single element interleaving %T"
2675 " step %T\n",
2676 DR_REF (dr), step);
2678 return true;
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2683 "not consecutive access %G", stmt_info->stmt);
2685 if (bb_vinfo)
2687 /* Mark the statement as unvectorizable. */
2688 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2689 return true;
2692 if (dump_enabled_p ())
2693 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2694 STMT_VINFO_STRIDED_P (stmt_info) = true;
2695 return true;
2698 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2700 /* First stmt in the interleaving chain. Check the chain. */
2701 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2702 struct data_reference *data_ref = dr;
2703 unsigned int count = 1;
2704 tree prev_init = DR_INIT (data_ref);
2705 HOST_WIDE_INT diff, gaps = 0;
2707 /* By construction, all group members have INTEGER_CST DR_INITs. */
2708 while (next)
2710 /* We never have the same DR multiple times. */
2711 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2712 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2714 data_ref = STMT_VINFO_DATA_REF (next);
2716 /* All group members have the same STEP by construction. */
2717 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2719 /* Check that the distance between two accesses is equal to the type
2720 size. Otherwise, we have gaps. */
2721 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2722 - TREE_INT_CST_LOW (prev_init)) / type_size;
2723 if (diff < 1 || diff > UINT_MAX)
2725 /* For artificial testcases with array accesses with large
2726 constant indices we can run into overflow issues which
2727 can end up fooling the groupsize constraint below so
2728 check the individual gaps (which are represented as
2729 unsigned int) as well. */
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2732 "interleaved access with gap larger "
2733 "than representable\n");
2734 return false;
2736 if (diff != 1)
2738 /* FORNOW: SLP of accesses with gaps is not supported. */
2739 slp_impossible = true;
2740 if (DR_IS_WRITE (data_ref))
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744 "interleaved store with gaps\n");
2745 return false;
2748 gaps += diff - 1;
2751 last_accessed_element += diff;
2753 /* Store the gap from the previous member of the group. If there is no
2754 gap in the access, DR_GROUP_GAP is always 1. */
2755 DR_GROUP_GAP (next) = diff;
2757 prev_init = DR_INIT (data_ref);
2758 next = DR_GROUP_NEXT_ELEMENT (next);
2759 /* Count the number of data-refs in the chain. */
2760 count++;
2763 if (groupsize == 0)
2764 groupsize = count + gaps;
2766 /* This could be UINT_MAX but as we are generating code in a very
2767 inefficient way we have to cap earlier. See PR78699 for example. */
2768 if (groupsize > 4096)
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2772 "group is too large\n");
2773 return false;
2776 /* Check that the size of the interleaving is equal to count for stores,
2777 i.e., that there are no gaps. */
2778 if (groupsize != count
2779 && !DR_IS_READ (dr))
2781 groupsize = count;
2782 STMT_VINFO_STRIDED_P (stmt_info) = true;
2785 /* If there is a gap after the last load in the group it is the
2786 difference between the groupsize and the last accessed
2787 element.
2788 When there is no gap, this difference should be 0. */
2789 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
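/* Worked example with invented numbers: 4-byte loads with DR_STEP = 16 at
   inits 0, 8 and 12 give groupsize = 16 / 4 = 4, per-member diffs of 2 and
   1 (so DR_GROUP_GAP of the second and third member are 2 and 1), count = 3,
   gaps = 1 and last_accessed_element = 4, hence no trailing gap; with inits
   0, 4 and 8 instead, last_accessed_element = 3 and the group leader gets
   DR_GROUP_GAP = 1 for the unused trailing element.  */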
2791 DR_GROUP_SIZE (stmt_info) = groupsize;
2792 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE, vect_location,
2795 "Detected interleaving ");
2796 if (DR_IS_READ (dr))
2797 dump_printf (MSG_NOTE, "load ");
2798 else if (STMT_VINFO_STRIDED_P (stmt_info))
2799 dump_printf (MSG_NOTE, "strided store ");
2800 else
2801 dump_printf (MSG_NOTE, "store ");
2802 dump_printf (MSG_NOTE, "of size %u\n",
2803 (unsigned)groupsize);
2804 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2805 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2806 while (next)
2808 if (DR_GROUP_GAP (next) != 1)
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "\t<gap of %d elements>\n",
2811 DR_GROUP_GAP (next) - 1);
2812 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2813 next = DR_GROUP_NEXT_ELEMENT (next);
2815 if (DR_GROUP_GAP (stmt_info) != 0)
2816 dump_printf_loc (MSG_NOTE, vect_location,
2817 "\t<gap of %d elements>\n",
2818 DR_GROUP_GAP (stmt_info));
2821 /* SLP: create an SLP data structure for every interleaving group of
2822 stores for further analysis in vect_analyze_slp. */
2823 if (DR_IS_WRITE (dr) && !slp_impossible)
2825 if (loop_vinfo)
2826 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2827 if (bb_vinfo)
2828 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2832 return true;
2835 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2836 accesses of legal size, step, etc. Detect gaps, single element
2837 interleaving, and other special cases. Set grouped access info.
2838 Collect groups of strided stores for further use in SLP analysis. */
2840 static bool
2841 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2843 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2845 /* Dissolve the group if present. */
2846 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2847 while (stmt_info)
2849 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2850 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2851 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2852 stmt_info = next;
2854 return false;
2856 return true;
2859 /* Analyze the access pattern of the data-reference DR_INFO.
2860 In case of non-consecutive accesses call vect_analyze_group_access() to
2861 analyze groups of accesses. */
2863 static bool
2864 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2866 data_reference *dr = dr_info->dr;
2867 tree step = DR_STEP (dr);
2868 tree scalar_type = TREE_TYPE (DR_REF (dr));
2869 stmt_vec_info stmt_info = dr_info->stmt;
2870 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2871 class loop *loop = NULL;
2873 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2874 return true;
2876 if (loop_vinfo)
2877 loop = LOOP_VINFO_LOOP (loop_vinfo);
2879 if (loop_vinfo && !step)
2881 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2883 "bad data-ref access in loop\n");
2884 return false;
2887 /* Allow loads with zero step in inner-loop vectorization. */
2888 if (loop_vinfo && integer_zerop (step))
2890 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2891 if (!nested_in_vect_loop_p (loop, stmt_info))
2892 return DR_IS_READ (dr);
2893 /* Allow references with zero step for outer loops marked
2894 with pragma omp simd only - it guarantees absence of
2895 loop-carried dependencies between inner loop iterations. */
2896 if (loop->safelen < 2)
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location,
2900 "zero step in inner loop of nest\n");
2901 return false;
2905 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2907 /* Interleaved accesses are not yet supported within outer-loop
2908 vectorization for references in the inner-loop. */
2909 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2911 /* For the rest of the analysis we use the outer-loop step. */
2912 step = STMT_VINFO_DR_STEP (stmt_info);
2913 if (integer_zerop (step))
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "zero step in outer loop.\n");
2918 return DR_IS_READ (dr);
2922 /* Consecutive? */
2923 if (TREE_CODE (step) == INTEGER_CST)
2925 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2926 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2927 || (dr_step < 0
2928 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2930 /* Mark that it is not interleaving. */
2931 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2932 return true;
2936 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 "grouped access in outer loop.\n");
2941 return false;
2945 /* Assume this is a DR handled by non-constant strided load case. */
2946 if (TREE_CODE (step) != INTEGER_CST)
2947 return (STMT_VINFO_STRIDED_P (stmt_info)
2948 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2949 || vect_analyze_group_access (vinfo, dr_info)));
2951 /* Not a consecutive access - check whether it is part of an interleaving group. */
2952 return vect_analyze_group_access (vinfo, dr_info);
2955 /* Compare two data-references DRA and DRB so that they can be sorted into
2956 chunks that are candidates for forming access groups. */
2958 static int
2959 dr_group_sort_cmp (const void *dra_, const void *drb_)
2961 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2962 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2963 data_reference_p dra = dra_info->dr;
2964 data_reference_p drb = drb_info->dr;
2965 int cmp;
2967 /* Stabilize sort. */
2968 if (dra == drb)
2969 return 0;
2971 /* DRs with different group IDs never belong to the same group. */
2972 if (dra_info->group != drb_info->group)
2973 return dra_info->group < drb_info->group ? -1 : 1;
2975 /* Ordering of DRs according to base. */
2976 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2977 DR_BASE_ADDRESS (drb));
2978 if (cmp != 0)
2979 return cmp;
2981 /* And according to DR_OFFSET. */
2982 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2983 if (cmp != 0)
2984 return cmp;
2986 /* Put reads before writes. */
2987 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2988 return DR_IS_READ (dra) ? -1 : 1;
2990 /* Then sort by access size. */
2991 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2992 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2993 if (cmp != 0)
2994 return cmp;
2996 /* And by DR_STEP. */
2997 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2998 if (cmp != 0)
2999 return cmp;
3001 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
3002 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3003 if (cmp == 0)
3004 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3005 return cmp;
3008 /* If OP is the result of a conversion, return the unconverted value,
3009 otherwise return null. */
3011 static tree
3012 strip_conversion (tree op)
3014 if (TREE_CODE (op) != SSA_NAME)
3015 return NULL_TREE;
3016 gimple *stmt = SSA_NAME_DEF_STMT (op);
3017 if (!is_gimple_assign (stmt)
3018 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3019 return NULL_TREE;
3020 return gimple_assign_rhs1 (stmt);
3023 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3024 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3025 be grouped in SLP mode. */
3027 static bool
3028 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3029 bool allow_slp_p)
3031 if (gimple_assign_single_p (stmt1_info->stmt))
3032 return gimple_assign_single_p (stmt2_info->stmt);
3034 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3035 if (call1 && gimple_call_internal_p (call1))
3037 /* Check for two masked loads or two masked stores. */
3038 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3039 if (!call2 || !gimple_call_internal_p (call2))
3040 return false;
3041 internal_fn ifn = gimple_call_internal_fn (call1);
3042 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3043 return false;
3044 if (ifn != gimple_call_internal_fn (call2))
3045 return false;
3047 /* Check that the masks are the same. Cope with casts of masks,
3048 like those created by build_mask_conversion. */
3049 tree mask1 = gimple_call_arg (call1, 2);
3050 tree mask2 = gimple_call_arg (call2, 2);
3051 if (!operand_equal_p (mask1, mask2, 0)
3052 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3054 mask1 = strip_conversion (mask1);
3055 if (!mask1)
3056 return false;
3057 mask2 = strip_conversion (mask2);
3058 if (!mask2)
3059 return false;
3060 if (!operand_equal_p (mask1, mask2, 0))
3061 return false;
3063 return true;
3066 return false;
3069 /* Function vect_analyze_data_ref_accesses.
3071 Analyze the access pattern of all the data references in the loop.
3073 FORNOW: the only access pattern that is considered vectorizable is a
3074 simple step 1 (consecutive) access.
3076 FORNOW: handle only arrays and pointer accesses. */
3078 opt_result
3079 vect_analyze_data_ref_accesses (vec_info *vinfo,
3080 vec<int> *dataref_groups)
3082 unsigned int i;
3083 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3085 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3087 if (datarefs.is_empty ())
3088 return opt_result::success ();
3090 /* Sort the array of datarefs to make building the interleaving chains
3091 linear. Don't modify the original vector's order; it is needed for
3092 determining what dependencies are reversed. */
3093 vec<dr_vec_info *> datarefs_copy;
3094 datarefs_copy.create (datarefs.length ());
3095 for (unsigned i = 0; i < datarefs.length (); i++)
3097 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3098 /* If the caller computed DR grouping use that, otherwise group by
3099 basic blocks. */
3100 if (dataref_groups)
3101 dr_info->group = (*dataref_groups)[i];
3102 else
3103 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3104 datarefs_copy.quick_push (dr_info);
3106 datarefs_copy.qsort (dr_group_sort_cmp);
3107 hash_set<stmt_vec_info> to_fixup;
3109 /* Build the interleaving chains. */
3110 for (i = 0; i < datarefs_copy.length () - 1;)
3112 dr_vec_info *dr_info_a = datarefs_copy[i];
3113 data_reference_p dra = dr_info_a->dr;
3114 int dra_group_id = dr_info_a->group;
3115 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3116 stmt_vec_info lastinfo = NULL;
3117 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3118 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3120 ++i;
3121 continue;
3123 for (i = i + 1; i < datarefs_copy.length (); ++i)
3125 dr_vec_info *dr_info_b = datarefs_copy[i];
3126 data_reference_p drb = dr_info_b->dr;
3127 int drb_group_id = dr_info_b->group;
3128 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3129 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3130 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3131 break;
3133 /* ??? Imperfect sorting (non-compatible types, non-modulo
3134 accesses, same accesses) can lead to a group being artificially
3135 split here as we don't just skip over those. If it really
3136 matters we can push those to a worklist and re-iterate
3137 over them. Then we can just skip ahead to the next DR here. */
3139 /* DRs in a different DR group should not be put into the same
3140 interleaving group. */
3141 if (dra_group_id != drb_group_id)
3142 break;
3144 /* Check that the data-refs have the same first location (except init)
3145 and they are both either store or load (not load and store,
3146 not masked loads or stores). */
3147 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3148 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3149 DR_BASE_ADDRESS (drb)) != 0
3150 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3151 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3152 break;
3154 /* Check that the data-refs have the same constant size. */
3155 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3156 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3157 if (!tree_fits_uhwi_p (sza)
3158 || !tree_fits_uhwi_p (szb)
3159 || !tree_int_cst_equal (sza, szb))
3160 break;
3162 /* Check that the data-refs have the same step. */
3163 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3164 break;
3166 /* Check the types are compatible.
3167 ??? We don't distinguish this during sorting. */
3168 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3169 TREE_TYPE (DR_REF (drb))))
3170 break;
3172 /* Check that the DR_INITs are compile-time constants. */
3173 if (!tree_fits_shwi_p (DR_INIT (dra))
3174 || !tree_fits_shwi_p (DR_INIT (drb)))
3175 break;
3177 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3178 just hold extra information. */
3179 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3180 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3181 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3182 break;
3184 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3185 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3186 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3187 HOST_WIDE_INT init_prev
3188 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3189 gcc_assert (init_a <= init_b
3190 && init_a <= init_prev
3191 && init_prev <= init_b);
3193 /* Do not place the same access in the interleaving chain twice. */
3194 if (init_b == init_prev)
3196 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3197 < gimple_uid (DR_STMT (drb)));
3198 /* Simply link in duplicates and fix up the chain below. */
3200 else
3202 /* If init_b == init_a + the size of the type * k, we have an
3203 interleaving, and DRA is accessed before DRB. */
3204 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3205 if (type_size_a == 0
3206 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3207 % type_size_a != 0))
3208 break;
3210 /* If we have a store, the accesses are adjacent. This splits
3211 groups into chunks we support (we don't support vectorization
3212 of stores with gaps). */
3213 if (!DR_IS_READ (dra)
3214 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3215 != type_size_a))
3216 break;
3218 /* If the step (when constant and nonzero) does not exceed the
3219 difference between the data-refs' inits, this splits groups into
3220 suitable sizes. */
3221 if (tree_fits_shwi_p (DR_STEP (dra)))
3223 unsigned HOST_WIDE_INT step
3224 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3225 if (step != 0
3226 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3227 break;
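/* Worked example with invented numbers: for 4-byte stores at inits 0, 4 and
   12 the chain is cut before the third store because 12 - 4 != 4 (stores
   with gaps are not supported); for 4-byte loads with a constant step of 8
   at inits 0, 4 and 12 the chain is cut before the third load because
   12 - 0 >= 8, keeping each group within one step.  */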
3231 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 DR_IS_READ (dra)
3234 ? "Detected interleaving load %T and %T\n"
3235 : "Detected interleaving store %T and %T\n",
3236 DR_REF (dra), DR_REF (drb));
3238 /* Link the found element into the group list. */
3239 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3241 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3242 lastinfo = stmtinfo_a;
3244 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3245 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3246 lastinfo = stmtinfo_b;
3248 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3249 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3251 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3252 dump_printf_loc (MSG_NOTE, vect_location,
3253 "Load suitable for SLP vectorization only.\n");
3255 if (init_b == init_prev
3256 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3257 && dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "Queuing group with duplicate access for fixup\n");
3263 /* Fixup groups with duplicate entries by splitting them. */
3264 while (1)
3266 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3267 if (!(it != to_fixup.end ()))
3268 break;
3269 stmt_vec_info grp = *it;
3270 to_fixup.remove (grp);
3272 /* Find the earliest duplicate group member. */
3273 unsigned first_duplicate = -1u;
3274 stmt_vec_info next, g = grp;
3275 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3277 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3278 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3279 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3280 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3281 g = next;
3283 if (first_duplicate == -1U)
3284 continue;
3286 /* Then move all stmts after the first duplicate to a new group.
3287 Note this is a heuristic but one with the property that *it
3288 is fixed up completely. */
3289 g = grp;
3290 stmt_vec_info newgroup = NULL, ng = grp;
3291 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3293 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3295 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3296 if (!newgroup)
3297 newgroup = next;
3298 else
3299 DR_GROUP_NEXT_ELEMENT (ng) = next;
3300 ng = next;
3301 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3303 else
3304 g = DR_GROUP_NEXT_ELEMENT (g);
3306 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3308 /* Fixup the new group, which may still contain duplicates. */
3309 to_fixup.add (newgroup);
3312 dr_vec_info *dr_info;
3313 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3315 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3316 && !vect_analyze_data_ref_access (vinfo, dr_info))
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "not vectorized: complicated access pattern.\n");
3322 if (is_a <bb_vec_info> (vinfo))
3324 /* Mark the statement as not vectorizable. */
3325 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3326 continue;
3328 else
3330 datarefs_copy.release ();
3331 return opt_result::failure_at (dr_info->stmt->stmt,
3332 "not vectorized:"
3333 " complicated access pattern.\n");
3338 datarefs_copy.release ();
3339 return opt_result::success ();
3342 /* Function vect_vfa_segment_size.
3344 Input:
3345 DR_INFO: The data reference.
3346 LENGTH_FACTOR: segment length to consider.
3348 Return a value suitable for the dr_with_seg_len::seg_len field.
3349 This is the "distance travelled" by the pointer from the first
3350 iteration in the segment to the last. Note that it does not include
3351 the size of the access; in effect it only describes the first byte. */
3353 static tree
3354 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3356 length_factor = size_binop (MINUS_EXPR,
3357 fold_convert (sizetype, length_factor),
3358 size_one_node);
3359 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3360 length_factor);
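/* Worked example with invented numbers: for DR_STEP = 4 bytes and
   LENGTH_FACTOR = 16 iterations the segment length is 4 * (16 - 1) = 60
   bytes, i.e. the distance from the first to the last iteration's access;
   the size of the access itself is accounted for separately by
   vect_vfa_access_size.  */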
3363 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3364 gives the worst-case number of bytes covered by the segment. */
3366 static unsigned HOST_WIDE_INT
3367 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3369 stmt_vec_info stmt_vinfo = dr_info->stmt;
3370 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3371 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3372 unsigned HOST_WIDE_INT access_size = ref_size;
3373 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3375 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3376 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3378 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3379 int misalignment;
3380 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3381 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3382 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3383 == dr_explicit_realign_optimized))
3385 /* We might access a full vector's worth. */
3386 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3388 return access_size;
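/* Worked example with invented numbers: a 4-byte scalar ref that leads an
   interleaved group of size 4 with DR_GROUP_GAP = 1 yields
   access_size = 4 * (4 - 1) = 12 bytes; if the access ends up using the
   dr_explicit_realign_optimized scheme, a further 16 - 4 = 12 bytes are
   added for a 16-byte vector, since a full vector's worth may be touched.  */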
3391 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3392 describes. */
3394 static unsigned int
3395 vect_vfa_align (dr_vec_info *dr_info)
3397 return dr_alignment (dr_info->dr);
3400 /* Function vect_no_alias_p.
3402 Given data references A and B with equal base and offset, see whether
3403 the alias relation can be decided at compilation time. Return 1 if
3404 it can and the references alias, 0 if it can and the references do
3405 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3406 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3407 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3409 static int
3410 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3411 tree segment_length_a, tree segment_length_b,
3412 unsigned HOST_WIDE_INT access_size_a,
3413 unsigned HOST_WIDE_INT access_size_b)
3415 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3416 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3417 poly_uint64 const_length_a;
3418 poly_uint64 const_length_b;
3420 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3421 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3422 [a, a+12) */
3423 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3425 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3426 offset_a -= const_length_a;
3428 else
3429 const_length_a = tree_to_poly_uint64 (segment_length_a);
3430 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3432 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3433 offset_b -= const_length_b;
3435 else
3436 const_length_b = tree_to_poly_uint64 (segment_length_b);
3438 const_length_a += access_size_a;
3439 const_length_b += access_size_b;
3441 if (ranges_known_overlap_p (offset_a, const_length_a,
3442 offset_b, const_length_b))
3443 return 1;
3445 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3446 offset_b, const_length_b))
3447 return 0;
3449 return -1;
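/* Illustrative example: with equal bases, DR_INIT offsets 0 and 32,
   positive steps, segment lengths of 16 and access sizes of 4, the
   ranges compared above are [0, 20) and [32, 52); they cannot overlap,
   so 0 (no alias) is returned.  If the second offset were 8 instead,
   the ranges [0, 20) and [8, 28) are known to overlap and 1 is
   returned.  */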
3452 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3453 in DDR is >= VF. */
3455 static bool
3456 dependence_distance_ge_vf (data_dependence_relation *ddr,
3457 unsigned int loop_depth, poly_uint64 vf)
3459 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3460 || DDR_NUM_DIST_VECTS (ddr) == 0)
3461 return false;
3463 /* If the dependence is exact, we should have limited the VF instead. */
3464 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3466 unsigned int i;
3467 lambda_vector dist_v;
3468 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3470 HOST_WIDE_INT dist = dist_v[loop_depth];
3471 if (dist != 0
3472 && !(dist > 0 && DDR_REVERSED_P (ddr))
3473 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3474 return false;
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "dependence distance between %T and %T is >= VF\n",
3480 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3482 return true;
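/* Illustrative example: with a single distance vector whose entry at
   LOOP_DEPTH is 3 and a vectorization factor of 8, the loop above sees
   3 < 8 and returns false, so a runtime alias check is still required;
   a distance of 8 or more would allow the check to be dropped.  */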
3485 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3487 static void
3488 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3490 dump_printf (dump_kind, "%s (%T) >= ",
3491 lower_bound.unsigned_p ? "unsigned" : "abs",
3492 lower_bound.expr);
3493 dump_dec (dump_kind, lower_bound.min_value);
3496 /* Record that the vectorized loop requires the vec_lower_bound described
3497 by EXPR, UNSIGNED_P and MIN_VALUE. */
3499 static void
3500 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3501 poly_uint64 min_value)
3503 vec<vec_lower_bound> &lower_bounds
3504 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3505 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3506 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3508 unsigned_p &= lower_bounds[i].unsigned_p;
3509 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3510 if (lower_bounds[i].unsigned_p != unsigned_p
3511 || maybe_lt (lower_bounds[i].min_value, min_value))
3513 lower_bounds[i].unsigned_p = unsigned_p;
3514 lower_bounds[i].min_value = min_value;
3515 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_NOTE, vect_location,
3518 "updating run-time check to ");
3519 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3520 dump_printf (MSG_NOTE, "\n");
3523 return;
3526 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3527 if (dump_enabled_p ())
3529 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3530 dump_lower_bound (MSG_NOTE, lower_bound);
3531 dump_printf (MSG_NOTE, "\n");
3533 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
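/* Illustrative example: if "unsigned (x) >= 4" was recorded earlier and
   a new request "abs (x) >= 8" arrives for the same EXPR, the entry is
   merged into a single check that implies both requirements: unsigned_p
   becomes false and min_value becomes the maximum of the two values,
   i.e. the loop is versioned on "abs (x) >= 8".  */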
3536 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3537 will span fewer than GAP bytes. */
3539 static bool
3540 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3541 poly_int64 gap)
3543 stmt_vec_info stmt_info = dr_info->stmt;
3544 HOST_WIDE_INT count
3545 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3546 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3547 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3548 return (estimated_poly_value (gap)
3549 <= count * vect_get_scalar_dr_size (dr_info));
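/* Illustrative example: with an estimated vectorization factor of 4, a
   group of 2 accesses and a scalar access size of 4 bytes, the
   vectorized step is expected to span about 4 * 2 * 4 = 32 bytes, so
   any GAP of at most 32 bytes counts as small and a step-based bound
   check is considered worthwhile by the caller.  */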
3552 /* Return true if we know that there is no alias between DR_INFO_A and
3553 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3554 When returning true, set *LOWER_BOUND_OUT to this N. */
3556 static bool
3557 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3558 poly_uint64 *lower_bound_out)
3560 /* Check that there is a constant gap of known sign between DR_A
3561 and DR_B. */
3562 data_reference *dr_a = dr_info_a->dr;
3563 data_reference *dr_b = dr_info_b->dr;
3564 poly_int64 init_a, init_b;
3565 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3566 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3567 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3568 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3569 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3570 || !ordered_p (init_a, init_b))
3571 return false;
3573 /* Sort DR_A and DR_B by the address they access. */
3574 if (maybe_lt (init_b, init_a))
3576 std::swap (init_a, init_b);
3577 std::swap (dr_info_a, dr_info_b);
3578 std::swap (dr_a, dr_b);
3581 /* If the two accesses could be dependent within a scalar iteration,
3582 make sure that we'd retain their order. */
3583 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3584 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3585 return false;
3587 /* There is no alias if abs (DR_STEP) is greater than or equal to
3588 the bytes spanned by the combination of the two accesses. */
3589 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3590 return true;
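/* Illustrative example: for accesses a[i] and a[i + 2] with 4-byte
   elements and matching base, offset and step, init_a = 0 and
   init_b = 8, so *LOWER_BOUND_OUT = 8 + 4 - 0 = 12: the references
   cannot alias once abs (DR_STEP) >= 12 bytes.  */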
3593 /* Function vect_prune_runtime_alias_test_list.
3595 Prune a list of ddrs to be tested at run-time by versioning for alias.
3596 Merge several alias checks into one if possible.
3597 Return FALSE if the resulting list of ddrs is longer than allowed by
3598 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3600 opt_result
3601 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3603 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3604 hash_set <tree_pair_hash> compared_objects;
3606 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3607 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3608 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3609 const vec<vec_object_pair> &check_unequal_addrs
3610 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3611 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3612 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3614 ddr_p ddr;
3615 unsigned int i;
3616 tree length_factor;
3618 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3620 /* Step values are irrelevant for aliasing if the number of vector
3621 iterations is equal to the number of scalar iterations (which can
3622 happen for fully-SLP loops). */
3623 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3625 if (!vf_one_p)
3627 /* Convert the checks for nonzero steps into bound tests. */
3628 tree value;
3629 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3630 vect_check_lower_bound (loop_vinfo, value, true, 1);
3633 if (may_alias_ddrs.is_empty ())
3634 return opt_result::success ();
3636 comp_alias_ddrs.create (may_alias_ddrs.length ());
3638 unsigned int loop_depth
3639 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3640 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3642 /* First, we collect all data ref pairs for aliasing checks. */
3643 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3645 poly_uint64 lower_bound;
3646 tree segment_length_a, segment_length_b;
3647 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3648 unsigned int align_a, align_b;
3650 /* Ignore the alias if the VF we chose ended up being no greater
3651 than the dependence distance. */
3652 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3653 continue;
3655 if (DDR_OBJECT_A (ddr))
3657 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3658 if (!compared_objects.add (new_pair))
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "checking that %T and %T"
3663 " have different addresses\n",
3664 new_pair.first, new_pair.second);
3665 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3667 continue;
3670 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3671 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3673 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3674 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3676 bool preserves_scalar_order_p
3677 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3678 bool ignore_step_p
3679 = (vf_one_p
3680 && (preserves_scalar_order_p
3681 || operand_equal_p (DR_STEP (dr_info_a->dr),
3682 DR_STEP (dr_info_b->dr))));
3684 /* Skip the pair if inter-iteration dependencies are irrelevant
3685 and intra-iteration dependencies are guaranteed to be honored. */
3686 if (ignore_step_p
3687 && (preserves_scalar_order_p
3688 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3689 &lower_bound)))
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_NOTE, vect_location,
3693 "no need for alias check between "
3694 "%T and %T when VF is 1\n",
3695 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3696 continue;
3699 /* See whether we can handle the alias using a bounds check on
3700 the step, and whether that's likely to be the best approach.
3701 (It might not be, for example, if the minimum step is much larger
3702 than the number of bytes handled by one vector iteration.) */
3703 if (!ignore_step_p
3704 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3705 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3706 &lower_bound)
3707 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3708 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3710 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3711 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3714 "%T and %T when the step %T is outside ",
3715 DR_REF (dr_info_a->dr),
3716 DR_REF (dr_info_b->dr),
3717 DR_STEP (dr_info_a->dr));
3718 if (unsigned_p)
3719 dump_printf (MSG_NOTE, "[0");
3720 else
3722 dump_printf (MSG_NOTE, "(");
3723 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3725 dump_printf (MSG_NOTE, ", ");
3726 dump_dec (MSG_NOTE, lower_bound);
3727 dump_printf (MSG_NOTE, ")\n");
3729 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3730 unsigned_p, lower_bound);
3731 continue;
3734 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3735 if (dr_group_first_a)
3737 stmt_info_a = dr_group_first_a;
3738 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3742 if (dr_group_first_b)
3744 stmt_info_b = dr_group_first_b;
3745 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748 if (ignore_step_p)
3750 segment_length_a = size_zero_node;
3751 segment_length_b = size_zero_node;
3753 else
3755 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3756 DR_STEP (dr_info_b->dr), 0))
3757 length_factor = scalar_loop_iters;
3758 else
3759 length_factor = size_int (vect_factor);
3760 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3761 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3763 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3764 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3765 align_a = vect_vfa_align (dr_info_a);
3766 align_b = vect_vfa_align (dr_info_b);
3768 /* See whether the alias is known at compilation time. */
3769 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3770 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3771 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3772 DR_OFFSET (dr_info_b->dr), 0)
3773 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3774 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3775 && poly_int_tree_p (segment_length_a)
3776 && poly_int_tree_p (segment_length_b))
3778 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3779 segment_length_a,
3780 segment_length_b,
3781 access_size_a,
3782 access_size_b);
3783 if (res >= 0 && dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "can tell at compile time that %T and %T",
3787 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3788 if (res == 0)
3789 dump_printf (MSG_NOTE, " do not alias\n");
3790 else
3791 dump_printf (MSG_NOTE, " alias\n");
3794 if (res == 0)
3795 continue;
3797 if (res == 1)
3798 return opt_result::failure_at (stmt_info_b->stmt,
3799 "not vectorized:"
3800 " compilation time alias: %G%G",
3801 stmt_info_a->stmt,
3802 stmt_info_b->stmt);
3805 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3806 access_size_a, align_a);
3807 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3808 access_size_b, align_b);
3809 /* Canonicalize the order to be the one that's needed for accurate
3810 RAW, WAR and WAW flags, in cases where the data references are
3811 well-ordered. The order doesn't really matter otherwise,
3812 but we might as well be consistent. */
3813 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3814 std::swap (dr_a, dr_b);
3816 dr_with_seg_len_pair_t dr_with_seg_len_pair
3817 (dr_a, dr_b, (preserves_scalar_order_p
3818 ? dr_with_seg_len_pair_t::WELL_ORDERED
3819 : dr_with_seg_len_pair_t::REORDERED));
3821 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3826 unsigned int count = (comp_alias_ddrs.length ()
3827 + check_unequal_addrs.length ());
3829 if (count
3830 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3831 == VECT_COST_MODEL_VERY_CHEAP))
3832 return opt_result::failure_at
3833 (vect_location, "would need a runtime alias check\n");
3835 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_NOTE, vect_location,
3837 "improved number of alias checks from %d to %d\n",
3838 may_alias_ddrs.length (), count);
3839 unsigned limit = param_vect_max_version_for_alias_checks;
3840 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3841 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3842 if (count > limit)
3843 return opt_result::failure_at
3844 (vect_location,
3845 "number of versioning for alias run-time tests exceeds %d "
3846 "(--param vect-max-version-for-alias-checks)\n", limit);
3848 return opt_result::success ();
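/* Note on the limits above (illustrative arithmetic only): with
   --param vect-max-version-for-alias-checks=10, the cheap cost model
   reduces the number of permitted checks to 10 * 6 / 10 = 6, while the
   very-cheap cost model rejects vectorization as soon as any runtime
   alias or address check would be needed.  */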
3851 /* Check whether we can use an internal function for a gather load
3852 or scatter store. READ_P is true for loads and false for stores.
3853 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3854 the type of the memory elements being loaded or stored. OFFSET_TYPE
3855 is the type of the offset that is being applied to the invariant
3856 base address. SCALE is the amount by which the offset should
3857 be multiplied *after* it has been converted to address width.
3859 Return true if the function is supported, storing the function id in
3860 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3862 bool
3863 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3864 tree vectype, tree memory_type, tree offset_type,
3865 int scale, internal_fn *ifn_out,
3866 tree *offset_vectype_out)
3868 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3869 unsigned int element_bits = vector_element_bits (vectype);
3870 if (element_bits != memory_bits)
3871 /* For now the vector elements must be the same width as the
3872 memory elements. */
3873 return false;
3875 /* Work out which function we need. */
3876 internal_fn ifn, alt_ifn, alt_ifn2;
3877 if (read_p)
3879 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3880 alt_ifn = IFN_MASK_GATHER_LOAD;
3881 /* When the target supports LEN_MASK_GATHER_LOAD, we always
3882 use LEN_MASK_GATHER_LOAD regardless of whether the len and
3883 mask operands are valid. */
3884 alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
3886 else
3888 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3889 alt_ifn = IFN_MASK_SCATTER_STORE;
3890 /* When the target supports LEN_MASK_SCATTER_STORE, we always
3891 use LEN_MASK_SCATTER_STORE regardless of whether the len and
3892 mask operands are valid. */
3893 alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
3896 for (;;)
3898 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3899 if (!offset_vectype)
3900 return false;
3902 /* Test whether the target supports this combination. */
3903 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3904 offset_vectype, scale))
3906 *ifn_out = ifn;
3907 *offset_vectype_out = offset_vectype;
3908 return true;
3910 else if (!masked_p
3911 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3912 memory_type,
3913 offset_vectype,
3914 scale))
3916 *ifn_out = alt_ifn;
3917 *offset_vectype_out = offset_vectype;
3918 return true;
3920 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
3921 memory_type,
3922 offset_vectype, scale))
3924 *ifn_out = alt_ifn2;
3925 *offset_vectype_out = offset_vectype;
3926 return true;
3929 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3930 && TYPE_PRECISION (offset_type) >= element_bits)
3931 return false;
3933 offset_type = build_nonstandard_integer_type
3934 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
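/* Illustrative example of the retry loop above: starting from a 16-bit
   offset type that the target cannot handle, the code widens the offset
   type to 32 and then 64 bits of the same signedness, giving up once the
   precision reaches both POINTER_SIZE and the vector element width
   without finding a supported combination.  */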
3938 /* STMT_INFO is a call to an internal gather load or scatter store function.
3939 Describe the operation in INFO. */
3941 static void
3942 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3943 gather_scatter_info *info)
3945 gcall *call = as_a <gcall *> (stmt_info->stmt);
3946 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3947 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3949 info->ifn = gimple_call_internal_fn (call);
3950 info->decl = NULL_TREE;
3951 info->base = gimple_call_arg (call, 0);
3952 info->offset = gimple_call_arg (call, 1);
3953 info->offset_dt = vect_unknown_def_type;
3954 info->offset_vectype = NULL_TREE;
3955 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3956 info->element_type = TREE_TYPE (vectype);
3957 info->memory_type = TREE_TYPE (DR_REF (dr));
3960 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3961 gather load or scatter store. Describe the operation in *INFO if so. */
3963 bool
3964 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3965 gather_scatter_info *info)
3967 HOST_WIDE_INT scale = 1;
3968 poly_int64 pbitpos, pbitsize;
3969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3970 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3971 tree offtype = NULL_TREE;
3972 tree decl = NULL_TREE, base, off;
3973 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3974 tree memory_type = TREE_TYPE (DR_REF (dr));
3975 machine_mode pmode;
3976 int punsignedp, reversep, pvolatilep = 0;
3977 internal_fn ifn;
3978 tree offset_vectype;
3979 bool masked_p = false;
3981 /* See whether this is already a call to a gather/scatter internal function.
3982 If not, see whether it's a masked load or store. */
3983 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3984 if (call && gimple_call_internal_p (call))
3986 ifn = gimple_call_internal_fn (call);
3987 if (internal_gather_scatter_fn_p (ifn))
3989 vect_describe_gather_scatter_call (stmt_info, info);
3990 return true;
3992 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3995 /* True if we should aim to use internal functions rather than
3996 built-in functions. */
3997 bool use_ifn_p = (DR_IS_READ (dr)
3998 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3999 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4001 base = DR_REF (dr);
4002 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4003 see if we can use the def stmt of the address. */
4004 if (masked_p
4005 && TREE_CODE (base) == MEM_REF
4006 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4007 && integer_zerop (TREE_OPERAND (base, 1))
4008 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4010 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4011 if (is_gimple_assign (def_stmt)
4012 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4013 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4016 /* The gather and scatter builtins need address of the form
4017 loop_invariant + vector * {1, 2, 4, 8} or
4019 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4020 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4021 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4022 multiplications and additions in it. To get a vector, we need
4023 a single SSA_NAME that will be defined in the loop and will
4024 contain everything that is not loop invariant and that can be
4025 vectorized. The following code attempts to find such a preexisting
4026 SSA_NAME OFF and put the loop invariants into a tree BASE
4027 that can be gimplified before the loop. */
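/* Illustrative example (hypothetical code, not from the source): for a
   gather-style read of a[ind[i]] with 4-byte elements, the decomposition
   below would typically end up with BASE = &a, OFF = the SSA_NAME
   holding (a possibly sign-extended) ind[i] and SCALE = 4, matching the
   form described above.  */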
4028 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4029 &punsignedp, &reversep, &pvolatilep);
4030 if (reversep)
4031 return false;
4033 /* PR 107346. Packed structs can have fields at offsets that are not
4034 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4035 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4036 return false;
4038 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4040 if (TREE_CODE (base) == MEM_REF)
4042 if (!integer_zerop (TREE_OPERAND (base, 1)))
4044 if (off == NULL_TREE)
4045 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4046 else
4047 off = size_binop (PLUS_EXPR, off,
4048 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4050 base = TREE_OPERAND (base, 0);
4052 else
4053 base = build_fold_addr_expr (base);
4055 if (off == NULL_TREE)
4056 off = size_zero_node;
4058 /* If base is not loop invariant, then either off is 0, in which case
4059 we start with just the constant offset in the loop invariant BASE
4060 and continue with base as OFF, or we give up.
4061 We could handle the latter case by gimplifying the addition of
4062 base + off into some SSA_NAME and using that as off, but for now punt. */
4063 if (!expr_invariant_in_loop_p (loop, base))
4065 if (!integer_zerop (off))
4066 return false;
4067 off = base;
4068 base = size_int (pbytepos);
4070 /* Otherwise put base + constant offset into the loop invariant BASE
4071 and continue with OFF. */
4072 else
4074 base = fold_convert (sizetype, base);
4075 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4078 /* OFF at this point may be either a SSA_NAME or some tree expression
4079 from get_inner_reference. Try to peel off loop invariants from it
4080 into BASE as long as possible. */
4081 STRIP_NOPS (off);
4082 while (offtype == NULL_TREE)
4084 enum tree_code code;
4085 tree op0, op1, add = NULL_TREE;
4087 if (TREE_CODE (off) == SSA_NAME)
4089 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4091 if (expr_invariant_in_loop_p (loop, off))
4092 return false;
4094 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4095 break;
4097 op0 = gimple_assign_rhs1 (def_stmt);
4098 code = gimple_assign_rhs_code (def_stmt);
4099 op1 = gimple_assign_rhs2 (def_stmt);
4101 else
4103 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4104 return false;
4105 code = TREE_CODE (off);
4106 extract_ops_from_tree (off, &code, &op0, &op1);
4108 switch (code)
4110 case POINTER_PLUS_EXPR:
4111 case PLUS_EXPR:
4112 if (expr_invariant_in_loop_p (loop, op0))
4114 add = op0;
4115 off = op1;
4116 do_add:
4117 add = fold_convert (sizetype, add);
4118 if (scale != 1)
4119 add = size_binop (MULT_EXPR, add, size_int (scale));
4120 base = size_binop (PLUS_EXPR, base, add);
4121 continue;
4123 if (expr_invariant_in_loop_p (loop, op1))
4125 add = op1;
4126 off = op0;
4127 goto do_add;
4129 break;
4130 case MINUS_EXPR:
4131 if (expr_invariant_in_loop_p (loop, op1))
4133 add = fold_convert (sizetype, op1);
4134 add = size_binop (MINUS_EXPR, size_zero_node, add);
4135 off = op0;
4136 goto do_add;
4138 break;
4139 case MULT_EXPR:
4140 if (scale == 1 && tree_fits_shwi_p (op1))
4142 int new_scale = tree_to_shwi (op1);
4143 /* Only treat this as a scaling operation if the target
4144 supports it for at least some offset type. */
4145 if (use_ifn_p
4146 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4147 masked_p, vectype, memory_type,
4148 signed_char_type_node,
4149 new_scale, &ifn,
4150 &offset_vectype)
4151 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4152 masked_p, vectype, memory_type,
4153 unsigned_char_type_node,
4154 new_scale, &ifn,
4155 &offset_vectype))
4156 break;
4157 scale = new_scale;
4158 off = op0;
4159 continue;
4161 break;
4162 case SSA_NAME:
4163 off = op0;
4164 continue;
4165 CASE_CONVERT:
4166 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4167 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4168 break;
4170 /* Don't include the conversion if the target is happy with
4171 the current offset type. */
4172 if (use_ifn_p
4173 && TREE_CODE (off) == SSA_NAME
4174 && !POINTER_TYPE_P (TREE_TYPE (off))
4175 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4176 masked_p, vectype, memory_type,
4177 TREE_TYPE (off), scale, &ifn,
4178 &offset_vectype))
4179 break;
4181 if (TYPE_PRECISION (TREE_TYPE (op0))
4182 == TYPE_PRECISION (TREE_TYPE (off)))
4184 off = op0;
4185 continue;
4188 /* Include the conversion if it is widening and either we're using
4189 the IFN path, the target can handle the converted-from offset,
4190 or the current offset size is not already the same as the
4191 data vector element size. */
4192 if ((TYPE_PRECISION (TREE_TYPE (op0))
4193 < TYPE_PRECISION (TREE_TYPE (off)))
4194 && (use_ifn_p
4195 || (DR_IS_READ (dr)
4196 ? (targetm.vectorize.builtin_gather
4197 && targetm.vectorize.builtin_gather (vectype,
4198 TREE_TYPE (op0),
4199 scale))
4200 : (targetm.vectorize.builtin_scatter
4201 && targetm.vectorize.builtin_scatter (vectype,
4202 TREE_TYPE (op0),
4203 scale)))
4204 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4205 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4207 off = op0;
4208 offtype = TREE_TYPE (off);
4209 STRIP_NOPS (off);
4210 continue;
4212 break;
4213 default:
4214 break;
4216 break;
4219 /* If at the end OFF still isn't a SSA_NAME or isn't
4220 defined in the loop, punt. */
4221 if (TREE_CODE (off) != SSA_NAME
4222 || expr_invariant_in_loop_p (loop, off))
4223 return false;
4225 if (offtype == NULL_TREE)
4226 offtype = TREE_TYPE (off);
4228 if (use_ifn_p)
4230 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4231 vectype, memory_type, offtype, scale,
4232 &ifn, &offset_vectype))
4233 ifn = IFN_LAST;
4234 decl = NULL_TREE;
4236 else
4238 if (DR_IS_READ (dr))
4240 if (targetm.vectorize.builtin_gather)
4241 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4243 else
4245 if (targetm.vectorize.builtin_scatter)
4246 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4248 ifn = IFN_LAST;
4249 /* The offset vector type will be read from DECL when needed. */
4250 offset_vectype = NULL_TREE;
4253 info->ifn = ifn;
4254 info->decl = decl;
4255 info->base = base;
4256 info->offset = off;
4257 info->offset_dt = vect_unknown_def_type;
4258 info->offset_vectype = offset_vectype;
4259 info->scale = scale;
4260 info->element_type = TREE_TYPE (vectype);
4261 info->memory_type = memory_type;
4262 return true;
4265 /* Find the data references in STMT, analyze them with respect to LOOP and
4266 append them to DATAREFS. Return false if datarefs in this stmt cannot
4267 be handled. */
4269 opt_result
4270 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4271 vec<data_reference_p> *datarefs,
4272 vec<int> *dataref_groups, int group_id)
4274 /* We can ignore clobbers for dataref analysis - they are removed during
4275 loop vectorization and BB vectorization checks dependences with a
4276 stmt walk. */
4277 if (gimple_clobber_p (stmt))
4278 return opt_result::success ();
4280 if (gimple_has_volatile_ops (stmt))
4281 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4282 stmt);
4284 if (stmt_can_throw_internal (cfun, stmt))
4285 return opt_result::failure_at (stmt,
4286 "not vectorized:"
4287 " statement can throw an exception: %G",
4288 stmt);
4290 auto_vec<data_reference_p, 2> refs;
4291 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4292 if (!res)
4293 return res;
4295 if (refs.is_empty ())
4296 return opt_result::success ();
4298 if (refs.length () > 1)
4300 while (!refs.is_empty ())
4301 free_data_ref (refs.pop ());
4302 return opt_result::failure_at (stmt,
4303 "not vectorized: more than one "
4304 "data ref in stmt: %G", stmt);
4307 data_reference_p dr = refs.pop ();
4308 if (gcall *call = dyn_cast <gcall *> (stmt))
4309 if (!gimple_call_internal_p (call)
4310 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4311 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4313 free_data_ref (dr);
4314 return opt_result::failure_at (stmt,
4315 "not vectorized: dr in a call %G", stmt);
4318 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4319 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4321 free_data_ref (dr);
4322 return opt_result::failure_at (stmt,
4323 "not vectorized:"
4324 " statement is an unsupported"
4325 " bitfield access %G", stmt);
4328 if (DR_BASE_ADDRESS (dr)
4329 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4331 free_data_ref (dr);
4332 return opt_result::failure_at (stmt,
4333 "not vectorized:"
4334 " base addr of dr is a constant\n");
4337 /* Check whether this may be a SIMD lane access and adjust the
4338 DR to make it easier for us to handle it. */
4339 if (loop
4340 && loop->simduid
4341 && (!DR_BASE_ADDRESS (dr)
4342 || !DR_OFFSET (dr)
4343 || !DR_INIT (dr)
4344 || !DR_STEP (dr)))
4346 struct data_reference *newdr
4347 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4348 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4349 if (DR_BASE_ADDRESS (newdr)
4350 && DR_OFFSET (newdr)
4351 && DR_INIT (newdr)
4352 && DR_STEP (newdr)
4353 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4354 && integer_zerop (DR_STEP (newdr)))
4356 tree base_address = DR_BASE_ADDRESS (newdr);
4357 tree off = DR_OFFSET (newdr);
4358 tree step = ssize_int (1);
4359 if (integer_zerop (off)
4360 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4362 off = TREE_OPERAND (base_address, 1);
4363 base_address = TREE_OPERAND (base_address, 0);
4365 STRIP_NOPS (off);
4366 if (TREE_CODE (off) == MULT_EXPR
4367 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4369 step = TREE_OPERAND (off, 1);
4370 off = TREE_OPERAND (off, 0);
4371 STRIP_NOPS (off);
4373 if (CONVERT_EXPR_P (off)
4374 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4375 < TYPE_PRECISION (TREE_TYPE (off))))
4376 off = TREE_OPERAND (off, 0);
4377 if (TREE_CODE (off) == SSA_NAME)
4379 gimple *def = SSA_NAME_DEF_STMT (off);
4380 /* Look through widening conversion. */
4381 if (is_gimple_assign (def)
4382 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4384 tree rhs1 = gimple_assign_rhs1 (def);
4385 if (TREE_CODE (rhs1) == SSA_NAME
4386 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4387 && (TYPE_PRECISION (TREE_TYPE (off))
4388 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4389 def = SSA_NAME_DEF_STMT (rhs1);
4391 if (is_gimple_call (def)
4392 && gimple_call_internal_p (def)
4393 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4395 tree arg = gimple_call_arg (def, 0);
4396 tree reft = TREE_TYPE (DR_REF (newdr));
4397 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4398 arg = SSA_NAME_VAR (arg);
4399 if (arg == loop->simduid
4400 /* For now. */
4401 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4403 DR_BASE_ADDRESS (newdr) = base_address;
4404 DR_OFFSET (newdr) = ssize_int (0);
4405 DR_STEP (newdr) = step;
4406 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4407 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4408 /* Mark as simd-lane access. */
4409 tree arg2 = gimple_call_arg (def, 1);
4410 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4411 free_data_ref (dr);
4412 datarefs->safe_push (newdr);
4413 if (dataref_groups)
4414 dataref_groups->safe_push (group_id);
4415 return opt_result::success ();
4420 free_data_ref (newdr);
4423 datarefs->safe_push (dr);
4424 if (dataref_groups)
4425 dataref_groups->safe_push (group_id);
4426 return opt_result::success ();
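/* Illustrative example of the SIMD lane handling above: for a store to
   foo[_1] where _1 is (a widening conversion of) the result of a
   .GOMP_SIMD_LANE call whose first argument is the loop's simduid and
   whose second argument is N, and where foo's element size matches the
   extracted step, the data reference is rewritten to a zero offset and
   a step equal to the element size, and its aux field is set to -1 - N
   to mark it as a SIMD lane access.  */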
4429 /* Function vect_analyze_data_refs.
4431 Find all the data references in the loop or basic block.
4433 The general structure of the analysis of data refs in the vectorizer is as
4434 follows:
4435 1- vect_analyze_data_refs(loop/bb): call
4436 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4437 in the loop/bb and their dependences.
4438 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4439 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4440 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4444 opt_result
4445 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4447 class loop *loop = NULL;
4448 unsigned int i;
4449 struct data_reference *dr;
4450 tree scalar_type;
4452 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4454 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4455 loop = LOOP_VINFO_LOOP (loop_vinfo);
4457 /* Go through the data-refs, check that the analysis succeeded. Update
4458 pointer from stmt_vec_info struct to DR and vectype. */
4460 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4461 FOR_EACH_VEC_ELT (datarefs, i, dr)
4463 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4464 poly_uint64 vf;
4466 gcc_assert (DR_REF (dr));
4467 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4468 gcc_assert (!stmt_info->dr_aux.dr);
4469 stmt_info->dr_aux.dr = dr;
4470 stmt_info->dr_aux.stmt = stmt_info;
4472 /* Check that analysis of the data-ref succeeded. */
4473 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4474 || !DR_STEP (dr))
4476 bool maybe_gather
4477 = DR_IS_READ (dr)
4478 && !TREE_THIS_VOLATILE (DR_REF (dr));
4479 bool maybe_scatter
4480 = DR_IS_WRITE (dr)
4481 && !TREE_THIS_VOLATILE (DR_REF (dr));
4483 /* If the target supports vector gather loads or scatter stores,
4484 see if they can be used. */
4485 if (is_a <loop_vec_info> (vinfo)
4486 && !nested_in_vect_loop_p (loop, stmt_info))
4488 if (maybe_gather || maybe_scatter)
4490 if (maybe_gather)
4491 gatherscatter = GATHER;
4492 else
4493 gatherscatter = SCATTER;
4497 if (gatherscatter == SG_NONE)
4499 if (dump_enabled_p ())
4500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4501 "not vectorized: data ref analysis "
4502 "failed %G", stmt_info->stmt);
4503 if (is_a <bb_vec_info> (vinfo))
4505 /* In BB vectorization the ref can still participate
4506 in dependence analysis, we just can't vectorize it. */
4507 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4508 continue;
4510 return opt_result::failure_at (stmt_info->stmt,
4511 "not vectorized:"
4512 " data ref analysis failed: %G",
4513 stmt_info->stmt);
4517 /* See if this was detected as SIMD lane access. */
4518 if (dr->aux == (void *)-1
4519 || dr->aux == (void *)-2
4520 || dr->aux == (void *)-3
4521 || dr->aux == (void *)-4)
4523 if (nested_in_vect_loop_p (loop, stmt_info))
4524 return opt_result::failure_at (stmt_info->stmt,
4525 "not vectorized:"
4526 " data ref analysis failed: %G",
4527 stmt_info->stmt);
4528 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4529 = -(uintptr_t) dr->aux;
4532 tree base = get_base_address (DR_REF (dr));
4533 if (base && VAR_P (base) && DECL_NONALIASED (base))
4535 if (dump_enabled_p ())
4536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4537 "not vectorized: base object not addressable "
4538 "for stmt: %G", stmt_info->stmt);
4539 if (is_a <bb_vec_info> (vinfo))
4541 /* In BB vectorization the ref can still participate
4542 in dependence analysis, we just can't vectorize it. */
4543 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4544 continue;
4546 return opt_result::failure_at (stmt_info->stmt,
4547 "not vectorized: base object not"
4548 " addressable for stmt: %G",
4549 stmt_info->stmt);
4552 if (is_a <loop_vec_info> (vinfo)
4553 && DR_STEP (dr)
4554 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4556 if (nested_in_vect_loop_p (loop, stmt_info))
4557 return opt_result::failure_at (stmt_info->stmt,
4558 "not vectorized: "
4559 "not suitable for strided load %G",
4560 stmt_info->stmt);
4561 STMT_VINFO_STRIDED_P (stmt_info) = true;
4564 /* Update DR field in stmt_vec_info struct. */
4566 /* If the dataref is in an inner-loop of the loop that is considered for
4567 vectorization, we also want to analyze the access relative to
4568 the outer-loop (DR contains information only relative to the
4569 inner-most enclosing loop). We do that by building a reference to the
4570 first location accessed by the inner-loop, and analyze it relative to
4571 the outer-loop. */
4572 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4574 /* Build a reference to the first location accessed by the
4575 inner loop: *(BASE + INIT + OFFSET). By construction,
4576 this address must be invariant in the inner loop, so we
4577 can consider it as being used in the outer loop. */
4578 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4579 tree offset = unshare_expr (DR_OFFSET (dr));
4580 tree init = unshare_expr (DR_INIT (dr));
4581 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4582 init, offset);
4583 tree init_addr = fold_build_pointer_plus (base, init_offset);
4584 tree init_ref = build_fold_indirect_ref (init_addr);
4586 if (dump_enabled_p ())
4587 dump_printf_loc (MSG_NOTE, vect_location,
4588 "analyze in outer loop: %T\n", init_ref);
4590 opt_result res
4591 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4592 init_ref, loop, stmt_info->stmt);
4593 if (!res)
4594 /* dr_analyze_innermost already explained the failure. */
4595 return res;
4597 if (dump_enabled_p ())
4598 dump_printf_loc (MSG_NOTE, vect_location,
4599 "\touter base_address: %T\n"
4600 "\touter offset from base address: %T\n"
4601 "\touter constant offset from base address: %T\n"
4602 "\touter step: %T\n"
4603 "\touter base alignment: %d\n\n"
4604 "\touter base misalignment: %d\n"
4605 "\touter offset alignment: %d\n"
4606 "\touter step alignment: %d\n",
4607 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4608 STMT_VINFO_DR_OFFSET (stmt_info),
4609 STMT_VINFO_DR_INIT (stmt_info),
4610 STMT_VINFO_DR_STEP (stmt_info),
4611 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4612 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4613 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4614 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4617 /* Set vectype for STMT. */
4618 scalar_type = TREE_TYPE (DR_REF (dr));
4619 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4620 if (!vectype)
4622 if (dump_enabled_p ())
4624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4625 "not vectorized: no vectype for stmt: %G",
4626 stmt_info->stmt);
4627 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4628 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4629 scalar_type);
4630 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4633 if (is_a <bb_vec_info> (vinfo))
4635 /* No vector type is fine, the ref can still participate
4636 in dependence analysis, we just can't vectorize it. */
4637 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4638 continue;
4640 if (fatal)
4641 *fatal = false;
4642 return opt_result::failure_at (stmt_info->stmt,
4643 "not vectorized:"
4644 " no vectype for stmt: %G"
4645 " scalar_type: %T\n",
4646 stmt_info->stmt, scalar_type);
4648 else
4650 if (dump_enabled_p ())
4651 dump_printf_loc (MSG_NOTE, vect_location,
4652 "got vectype for stmt: %G%T\n",
4653 stmt_info->stmt, vectype);
4656 /* Adjust the minimal vectorization factor according to the
4657 vector type. */
4658 vf = TYPE_VECTOR_SUBPARTS (vectype);
4659 *min_vf = upper_bound (*min_vf, vf);
4661 /* Leave the BB vectorizer to pick the vector type later, based on
4662 the final dataref group size and SLP node size. */
4663 if (is_a <loop_vec_info> (vinfo))
4664 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4666 if (gatherscatter != SG_NONE)
4668 gather_scatter_info gs_info;
4669 if (!vect_check_gather_scatter (stmt_info,
4670 as_a <loop_vec_info> (vinfo),
4671 &gs_info)
4672 || !get_vectype_for_scalar_type (vinfo,
4673 TREE_TYPE (gs_info.offset)))
4675 if (fatal)
4676 *fatal = false;
4677 return opt_result::failure_at
4678 (stmt_info->stmt,
4679 (gatherscatter == GATHER)
4680 ? "not vectorized: not suitable for gather load %G"
4681 : "not vectorized: not suitable for scatter store %G",
4682 stmt_info->stmt);
4684 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4688 /* We used to stop processing and prune the list here. Verify we no
4689 longer need to. */
4690 gcc_assert (i == datarefs.length ());
4692 return opt_result::success ();
4696 /* Function vect_get_new_vect_var.
4698 Returns a name for a new variable. The current naming scheme appends the
4699 prefix "vect_" or "vectp_" (depending on the value of VAR_KIND) to
4700 the name of vectorizer generated variables, and appends that to NAME if
4701 provided. */
4703 tree
4704 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4706 const char *prefix;
4707 tree new_vect_var;
4709 switch (var_kind)
4711 case vect_simple_var:
4712 prefix = "vect";
4713 break;
4714 case vect_scalar_var:
4715 prefix = "stmp";
4716 break;
4717 case vect_mask_var:
4718 prefix = "mask";
4719 break;
4720 case vect_pointer_var:
4721 prefix = "vectp";
4722 break;
4723 default:
4724 gcc_unreachable ();
4727 if (name)
4729 char* tmp = concat (prefix, "_", name, NULL);
4730 new_vect_var = create_tmp_reg (type, tmp);
4731 free (tmp);
4733 else
4734 new_vect_var = create_tmp_reg (type, prefix);
4736 return new_vect_var;
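/* Illustrative example: vect_get_new_vect_var (ptr_type, vect_pointer_var,
   "a") creates a temporary register whose name is based on "vectp_a",
   while passing a NULL name uses just the "vectp" prefix.  */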
4739 /* Like vect_get_new_vect_var but return an SSA name. */
4741 tree
4742 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4744 const char *prefix;
4745 tree new_vect_var;
4747 switch (var_kind)
4749 case vect_simple_var:
4750 prefix = "vect";
4751 break;
4752 case vect_scalar_var:
4753 prefix = "stmp";
4754 break;
4755 case vect_pointer_var:
4756 prefix = "vectp";
4757 break;
4758 default:
4759 gcc_unreachable ();
4762 if (name)
4764 char* tmp = concat (prefix, "_", name, NULL);
4765 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4766 free (tmp);
4768 else
4769 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4771 return new_vect_var;
4774 /* Duplicate points-to info on NAME from DR_INFO. */
4776 static void
4777 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4779 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4780 /* DR_PTR_INFO is for a base SSA name, not including constant or
4781 variable offsets in the ref so its alignment info does not apply. */
4782 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4785 /* Function vect_create_addr_base_for_vector_ref.
4787 Create an expression that computes the address of the first memory location
4788 that will be accessed for a data reference.
4790 Input:
4791 STMT_INFO: The statement containing the data reference.
4792 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4793 OFFSET: Optional. If supplied, it is added to the initial address.
4794 LOOP: Specify relative to which loop-nest should the address be computed.
4795 For example, when the dataref is in an inner-loop nested in an
4796 outer-loop that is now being vectorized, LOOP can be either the
4797 outer-loop, or the inner-loop. The first memory location accessed
4798 by the following dataref ('in' points to short):
4800 for (i=0; i<N; i++)
4801 for (j=0; j<M; j++)
4802 s += in[i+j]
4804 is as follows:
4805 if LOOP=i_loop: &in (relative to i_loop)
4806 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4808 Output:
4809 1. Return an SSA_NAME whose value is the address of the memory location of
4810 the first vector of the data reference.
4811 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4812 these statement(s) which define the returned SSA_NAME.
4814 FORNOW: We are only handling array accesses with step 1. */
4816 tree
4817 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4818 gimple_seq *new_stmt_list,
4819 tree offset)
4821 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4822 struct data_reference *dr = dr_info->dr;
4823 const char *base_name;
4824 tree addr_base;
4825 tree dest;
4826 gimple_seq seq = NULL;
4827 tree vect_ptr_type;
4828 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4829 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4831 tree data_ref_base = unshare_expr (drb->base_address);
4832 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4833 tree init = unshare_expr (drb->init);
4835 if (loop_vinfo)
4836 base_name = get_name (data_ref_base);
4837 else
4839 base_offset = ssize_int (0);
4840 init = ssize_int (0);
4841 base_name = get_name (DR_REF (dr));
4844 /* Create base_offset */
4845 base_offset = size_binop (PLUS_EXPR,
4846 fold_convert (sizetype, base_offset),
4847 fold_convert (sizetype, init));
4849 if (offset)
4851 offset = fold_convert (sizetype, offset);
4852 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4853 base_offset, offset);
4856 /* base + base_offset */
4857 if (loop_vinfo)
4858 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4859 else
4860 addr_base = build1 (ADDR_EXPR,
4861 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4862 /* Strip zero offset components since we don't need
4863 them and they can confuse late diagnostics if
4864 we CSE them wrongly. See PR106904 for example. */
4865 unshare_expr (strip_zero_offset_components
4866 (DR_REF (dr))));
4868 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4869 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4870 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4871 gimple_seq_add_seq (new_stmt_list, seq);
4873 if (DR_PTR_INFO (dr)
4874 && TREE_CODE (addr_base) == SSA_NAME
4875 /* We should only duplicate pointer info to newly created SSA names. */
4876 && SSA_NAME_VAR (addr_base) == dest)
4878 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4879 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4882 if (dump_enabled_p ())
4883 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4885 return addr_base;
4889 /* Function vect_create_data_ref_ptr.
4891 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4892 location accessed in the loop by STMT_INFO, along with the def-use update
4893 chain to appropriately advance the pointer through the loop iterations.
4894 Also set aliasing information for the pointer. This pointer is used by
4895 the callers to this function to create a memory reference expression for
4896 vector load/store access.
4898 Input:
4899 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4900 GIMPLE_ASSIGN <name, data-ref> or
4901 GIMPLE_ASSIGN <data-ref, name>.
4902 2. AGGR_TYPE: the type of the reference, which should be either a vector
4903 or an array.
4904 3. AT_LOOP: the loop where the vector memref is to be created.
4905 4. OFFSET (optional): a byte offset to be added to the initial address
4906 accessed by the data-ref in STMT_INFO.
4907 5. BSI: location where the new stmts are to be placed if there is no loop
4908 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4909 pointing to the initial address.
4910 7. IV_STEP (optional, defaults to NULL): the amount that should be added
4911 to the IV during each iteration of the loop. NULL says to move
4912 by one copy of AGGR_TYPE up or down, depending on the step of the
4913 data reference.
4915 Output:
4916 1. Declare a new ptr to vector_type, and have it point to the base of the
4917 data reference (the initial address accessed by the data reference).
4918 For example, for vector of type V8HI, the following code is generated:
4920 v8hi *ap;
4921 ap = (v8hi *)initial_address;
4923 if OFFSET is not supplied:
4924 initial_address = &a[init];
4925 if OFFSET is supplied:
4926 initial_address = &a[init] + OFFSET;
4930 Return the initial_address in INITIAL_ADDRESS.
4932 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4933 update the pointer in each iteration of the loop.
4935 Return the increment stmt that updates the pointer in PTR_INCR.
4937 3. Return the pointer. */
4939 tree
4940 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4941 tree aggr_type, class loop *at_loop, tree offset,
4942 tree *initial_address, gimple_stmt_iterator *gsi,
4943 gimple **ptr_incr, bool only_init,
4944 tree iv_step)
4946 const char *base_name;
4947 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4948 class loop *loop = NULL;
4949 bool nested_in_vect_loop = false;
4950 class loop *containing_loop = NULL;
4951 tree aggr_ptr_type;
4952 tree aggr_ptr;
4953 tree new_temp;
4954 gimple_seq new_stmt_list = NULL;
4955 edge pe = NULL;
4956 basic_block new_bb;
4957 tree aggr_ptr_init;
4958 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4959 struct data_reference *dr = dr_info->dr;
4960 tree aptr;
4961 gimple_stmt_iterator incr_gsi;
4962 bool insert_after;
4963 tree indx_before_incr, indx_after_incr;
4964 gimple *incr;
4965 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4967 gcc_assert (iv_step != NULL_TREE
4968 || TREE_CODE (aggr_type) == ARRAY_TYPE
4969 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4971 if (loop_vinfo)
4973 loop = LOOP_VINFO_LOOP (loop_vinfo);
4974 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4975 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4976 pe = loop_preheader_edge (loop);
4978 else
4980 gcc_assert (bb_vinfo);
4981 only_init = true;
4982 *ptr_incr = NULL;
4985 /* Create an expression for the first address accessed by this load
4986 in LOOP. */
4987 base_name = get_name (DR_BASE_ADDRESS (dr));
4989 if (dump_enabled_p ())
4991 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4992 dump_printf_loc (MSG_NOTE, vect_location,
4993 "create %s-pointer variable to type: %T",
4994 get_tree_code_name (TREE_CODE (aggr_type)),
4995 aggr_type);
4996 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4997 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4998 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4999 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5000 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5001 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5002 else
5003 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5004 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5007 /* (1) Create the new aggregate-pointer variable.
5008 Vector and array types inherit the alias set of their component
5009 type by default so we need to use a ref-all pointer if the data
5010 reference does not conflict with the created aggregated data
5011 reference because it is not addressable. */
5012 bool need_ref_all = false;
5013 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5014 get_alias_set (DR_REF (dr))))
5015 need_ref_all = true;
5016 /* Likewise for any of the data references in the stmt group. */
5017 else if (DR_GROUP_SIZE (stmt_info) > 1)
5019 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5022 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5023 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5024 get_alias_set (DR_REF (sdr))))
5026 need_ref_all = true;
5027 break;
5029 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5031 while (sinfo);
5033 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5034 need_ref_all);
5035 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5038 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5039 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5040 def-use update cycles for the pointer: one relative to the outer-loop
5041 (LOOP), which is what steps (3) and (4) below do. The other is relative
5042 to the inner-loop (which is the inner-most loop containing the dataref),
5043 and this is done by step (5) below.
5045 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5046 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5047 redundant. Steps (3),(4) create the following:
5049 vp0 = &base_addr;
5050 LOOP: vp1 = phi(vp0,vp2)
5053 vp2 = vp1 + step
5054 goto LOOP
5056 If there is an inner-loop nested in loop, then step (5) will also be
5057 applied, and an additional update in the inner-loop will be created:
5059 vp0 = &base_addr;
5060 LOOP: vp1 = phi(vp0,vp2)
5062 inner: vp3 = phi(vp1,vp4)
5063 vp4 = vp3 + inner_step
5064 if () goto inner
5066 vp2 = vp1 + step
5067 if () goto LOOP */
5069 /* (2) Calculate the initial address of the aggregate-pointer, and set
5070 the aggregate-pointer to point to it before the loop. */
5072 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5074 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5075 stmt_info, &new_stmt_list,
5076 offset);
5077 if (new_stmt_list)
5079 if (pe)
5081 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5082 gcc_assert (!new_bb);
5084 else
5085 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5088 *initial_address = new_temp;
5089 aggr_ptr_init = new_temp;
5091 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5092 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5093 inner-loop nested in LOOP (during outer-loop vectorization). */
5095 /* No update in loop is required. */
5096 if (only_init && (!loop_vinfo || at_loop == loop))
5097 aptr = aggr_ptr_init;
5098 else
5100 /* Accesses to invariant addresses should be handled specially
5101 by the caller. */
5102 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5103 gcc_assert (!integer_zerop (step));
5105 if (iv_step == NULL_TREE)
5107 /* The step of the aggregate pointer is the type size,
5108 negated for downward accesses. */
5109 iv_step = TYPE_SIZE_UNIT (aggr_type);
5110 if (tree_int_cst_sgn (step) == -1)
5111 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5114 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5116 create_iv (aggr_ptr_init, PLUS_EXPR,
5117 fold_convert (aggr_ptr_type, iv_step),
5118 aggr_ptr, loop, &incr_gsi, insert_after,
5119 &indx_before_incr, &indx_after_incr);
5120 incr = gsi_stmt (incr_gsi);
5122 /* Copy the points-to information if it exists. */
5123 if (DR_PTR_INFO (dr))
5125 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5126 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5128 if (ptr_incr)
5129 *ptr_incr = incr;
5131 aptr = indx_before_incr;
5134 if (!nested_in_vect_loop || only_init)
5135 return aptr;
5138 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5139 nested in LOOP, if exists. */
5141 gcc_assert (nested_in_vect_loop);
5142 if (!only_init)
5144 standard_iv_increment_position (containing_loop, &incr_gsi,
5145 &insert_after);
5146 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5147 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5148 &indx_before_incr, &indx_after_incr);
5149 incr = gsi_stmt (incr_gsi);
5151 /* Copy the points-to information if it exists. */
5152 if (DR_PTR_INFO (dr))
5154 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5155 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5157 if (ptr_incr)
5158 *ptr_incr = incr;
5160 return indx_before_incr;
5162 else
5163 gcc_unreachable ();
5167 /* Function bump_vector_ptr
5169 Increment a pointer (to a vector type) by vector-size. If requested,
5170 i.e. if PTR_INCR is given, then also connect the new increment stmt
5171 to the existing def-use update-chain of the pointer, by modifying
5172 the PTR_INCR as illustrated below:
5174 The pointer def-use update-chain before this function:
5175 DATAREF_PTR = phi (p_0, p_2)
5176 ....
5177 PTR_INCR: p_2 = DATAREF_PTR + step
5179 The pointer def-use update-chain after this function:
5180 DATAREF_PTR = phi (p_0, p_2)
5181 ....
5182 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5183 ....
5184 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5186 Input:
5187 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5188 in the loop.
5189 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5190 the loop. The increment amount across iterations is expected
5191 to be vector_size.
5192 BSI - location where the new update stmt is to be placed.
5193 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5194 BUMP - optional. The offset by which to bump the pointer. If not given,
5195 the offset is assumed to be vector_size.
5197 Output: Return NEW_DATAREF_PTR as illustrated above.
5201 tree
5202 bump_vector_ptr (vec_info *vinfo,
5203 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5204 stmt_vec_info stmt_info, tree bump)
5206 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5207 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5208 tree update = TYPE_SIZE_UNIT (vectype);
5209 gimple *incr_stmt;
5210 ssa_op_iter iter;
5211 use_operand_p use_p;
5212 tree new_dataref_ptr;
5214 if (bump)
5215 update = bump;
5217 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5218 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5219 else if (is_gimple_min_invariant (dataref_ptr))
5220 /* When possible avoid emitting a separate increment stmt that would
5221 force the addressed object to become addressable. */
5222 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5223 fold_build2 (MEM_REF,
5224 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5225 dataref_ptr,
5226 fold_convert (ptr_type_node, update)));
5227 else
5228 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5229 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5230 dataref_ptr, update);
5231 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5232 /* Fold the increment, avoiding excessive use-def chains that would
5233 otherwise cause compile-time issues for passes run before the next
5234 forwprop pass, which would do this folding as well. */
5235 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5236 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5238 incr_stmt = gsi_stmt (fold_gsi);
5239 update_stmt (incr_stmt);
5242 /* Copy the points-to information if it exists. */
5243 if (DR_PTR_INFO (dr))
5245 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5246 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5249 if (!ptr_incr)
5250 return new_dataref_ptr;
5252 /* Update the vector-pointer's cross-iteration increment. */
5253 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5255 tree use = USE_FROM_PTR (use_p);
5257 if (use == dataref_ptr)
5258 SET_USE (use_p, new_dataref_ptr);
5259 else
5260 gcc_assert (operand_equal_p (use, update, 0));
5263 return new_dataref_ptr;
5267 /* Copy memory reference info such as base/clique from the SRC reference
5268 to the DEST MEM_REF. */
5270 void
5271 vect_copy_ref_info (tree dest, tree src)
5273 if (TREE_CODE (dest) != MEM_REF)
5274 return;
5276 tree src_base = src;
5277 while (handled_component_p (src_base))
5278 src_base = TREE_OPERAND (src_base, 0);
5279 if (TREE_CODE (src_base) != MEM_REF
5280 && TREE_CODE (src_base) != TARGET_MEM_REF)
5281 return;
5283 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5284 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5288 /* Function vect_create_destination_var.
5290 Create a new temporary of type VECTYPE. */
5292 tree
5293 vect_create_destination_var (tree scalar_dest, tree vectype)
5295 tree vec_dest;
5296 const char *name;
5297 char *new_name;
5298 tree type;
5299 enum vect_var_kind kind;
5301 kind = vectype
5302 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5303 ? vect_mask_var
5304 : vect_simple_var
5305 : vect_scalar_var;
5306 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5308 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5310 name = get_name (scalar_dest);
5311 if (name)
5312 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5313 else
5314 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5315 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5316 free (new_name);
5318 return vec_dest;
5321 /* Function vect_grouped_store_supported.
5323 Returns TRUE if interleave high and interleave low permutations
5324 are supported, and FALSE otherwise. */
5326 bool
5327 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5329 machine_mode mode = TYPE_MODE (vectype);
5331 /* vect_permute_store_chain requires the group size to be equal to 3 or
5332 be a power of two. */
5333 if (count != 3 && exact_log2 (count) == -1)
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5337 "the size of the group of accesses"
5338 " is not a power of 2 or not eqaul to 3\n");
5339 return false;
5342 /* Check that the permutation is supported. */
5343 if (VECTOR_MODE_P (mode))
5345 unsigned int i;
5346 if (count == 3)
5348 unsigned int j0 = 0, j1 = 0, j2 = 0;
5349 unsigned int i, j;
5351 unsigned int nelt;
5352 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5354 if (dump_enabled_p ())
5355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5356 "cannot handle groups of 3 stores for"
5357 " variable-length vectors\n");
5358 return false;
5361 vec_perm_builder sel (nelt, nelt, 1);
5362 sel.quick_grow (nelt);
5363 vec_perm_indices indices;
5364 for (j = 0; j < 3; j++)
5366 int nelt0 = ((3 - j) * nelt) % 3;
5367 int nelt1 = ((3 - j) * nelt + 1) % 3;
5368 int nelt2 = ((3 - j) * nelt + 2) % 3;
5369 for (i = 0; i < nelt; i++)
5371 if (3 * i + nelt0 < nelt)
5372 sel[3 * i + nelt0] = j0++;
5373 if (3 * i + nelt1 < nelt)
5374 sel[3 * i + nelt1] = nelt + j1++;
5375 if (3 * i + nelt2 < nelt)
5376 sel[3 * i + nelt2] = 0;
5378 indices.new_vector (sel, 2, nelt);
5379 if (!can_vec_perm_const_p (mode, mode, indices))
5381 if (dump_enabled_p ())
5382 dump_printf (MSG_MISSED_OPTIMIZATION,
5383 "permutation op not supported by target.\n");
5384 return false;
5387 for (i = 0; i < nelt; i++)
5389 if (3 * i + nelt0 < nelt)
5390 sel[3 * i + nelt0] = 3 * i + nelt0;
5391 if (3 * i + nelt1 < nelt)
5392 sel[3 * i + nelt1] = 3 * i + nelt1;
5393 if (3 * i + nelt2 < nelt)
5394 sel[3 * i + nelt2] = nelt + j2++;
5396 indices.new_vector (sel, 2, nelt);
5397 if (!can_vec_perm_const_p (mode, mode, indices))
5399 if (dump_enabled_p ())
5400 dump_printf (MSG_MISSED_OPTIMIZATION,
5401 "permutation op not supported by target.\n");
5402 return false;
5405 return true;
5407 else
5409 /* If length is not equal to 3 then only power of 2 is supported. */
5410 gcc_assert (pow2p_hwi (count));
5411 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5413 /* The encoding has 2 interleaved stepped patterns. */
5414 if (!multiple_p (nelt, 2))
5415 return false;
5416 vec_perm_builder sel (nelt, 2, 3);
5417 sel.quick_grow (6);
5418 for (i = 0; i < 3; i++)
5420 sel[i * 2] = i;
5421 sel[i * 2 + 1] = i + nelt;
5423 vec_perm_indices indices (sel, 2, nelt);
5424 if (can_vec_perm_const_p (mode, mode, indices))
5426 for (i = 0; i < 6; i++)
5427 sel[i] += exact_div (nelt, 2);
5428 indices.new_vector (sel, 2, nelt);
5429 if (can_vec_perm_const_p (mode, mode, indices))
5430 return true;
5435 if (dump_enabled_p ())
5436 dump_printf (MSG_MISSED_OPTIMIZATION,
5437 "permutation op not supported by target.\n");
5438 return false;
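/* Illustrative sketch, not part of GCC: the power-of-two path above asks the
   target whether it can perform the two "interleave" permutations whose
   selectors are encoded with two interleaved stepped patterns.  The standalone
   program below (plain C++, no GCC APIs; all names are local assumptions of
   the sketch) simply expands those selectors for an 8-element vector so they
   can be compared with the masks described in vect_permute_store_chain.

     #include <cstdio>

     static void
     print_interleave_sels (unsigned nelt)
     {
       printf ("high:");                 // { 0, nelt, 1, nelt + 1, ... }
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", i, nelt + i);
       printf ("\nlow: ");               // { nelt/2, nelt * 3/2, ... }
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", nelt / 2 + i, nelt + nelt / 2 + i);
       printf ("\n");
     }

     int
     main ()
     {
       print_interleave_sels (8);        // high: 0 8 1 9 2 10 3 11
       return 0;                         // low:  4 12 5 13 6 14 7 15
     }

   As with VEC_PERM_EXPR, indices 0..nelt-1 select from the first input vector
   and nelt..2*nelt-1 from the second.  */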
5442 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5443 type VECTYPE. MASKED_P says whether the masked form is needed. */
5445 bool
5446 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5447 bool masked_p)
5449 if (masked_p)
5450 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5451 vec_mask_store_lanes_optab,
5452 vectype, count);
5453 else
5454 return vect_lanes_optab_supported_p ("vec_store_lanes",
5455 vec_store_lanes_optab,
5456 vectype, count);
5460 /* Function vect_permute_store_chain.
5462 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5463 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5464 the data correctly for the stores. Return the final references for stores
5465 in RESULT_CHAIN.
5467 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5468 The input is 4 vectors each containing 8 elements. We assign a number to
5469 each element, the input sequence is:
5471 1st vec: 0 1 2 3 4 5 6 7
5472 2nd vec: 8 9 10 11 12 13 14 15
5473 3rd vec: 16 17 18 19 20 21 22 23
5474 4th vec: 24 25 26 27 28 29 30 31
5476 The output sequence should be:
5478 1st vec: 0 8 16 24 1 9 17 25
5479 2nd vec: 2 10 18 26 3 11 19 27
5480 3rd vec: 4 12 20 28 5 13 21 29
5481 4th vec: 6 14 22 30 7 15 23 31
5483 i.e., we interleave the contents of the four vectors in their order.
5485 We use interleave_high/low instructions to create such output. The input of
5486 each interleave_high/low operation is two vectors:
5487 1st vec 2nd vec
5488 0 1 2 3 4 5 6 7
5489 the even elements of the result vector are obtained left-to-right from the
5490 high/low elements of the first vector. The odd elements of the result are
5491 obtained left-to-right from the high/low elements of the second vector.
5492 The output of interleave_high will be: 0 4 1 5
5493 and of interleave_low: 2 6 3 7
5496 The permutation is done in log LENGTH stages. In each stage interleave_high
5497 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5498 where the first argument is taken from the first half of DR_CHAIN and the
5499 second argument from its second half.
5500 In our example,
5502 I1: interleave_high (1st vec, 3rd vec)
5503 I2: interleave_low (1st vec, 3rd vec)
5504 I3: interleave_high (2nd vec, 4th vec)
5505 I4: interleave_low (2nd vec, 4th vec)
5507 The output for the first stage is:
5509 I1: 0 16 1 17 2 18 3 19
5510 I2: 4 20 5 21 6 22 7 23
5511 I3: 8 24 9 25 10 26 11 27
5512 I4: 12 28 13 29 14 30 15 31
5514 The output of the second stage, i.e. the final result is:
5516 I1: 0 8 16 24 1 9 17 25
5517 I2: 2 10 18 26 3 11 19 27
5518 I3: 4 12 20 28 5 13 21 29
5519 I4: 6 14 22 30 7 15 23 31. */
5521 void
5522 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5523 unsigned int length,
5524 stmt_vec_info stmt_info,
5525 gimple_stmt_iterator *gsi,
5526 vec<tree> *result_chain)
5528 tree vect1, vect2, high, low;
5529 gimple *perm_stmt;
5530 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5531 tree perm_mask_low, perm_mask_high;
5532 tree data_ref;
5533 tree perm3_mask_low, perm3_mask_high;
5534 unsigned int i, j, n, log_length = exact_log2 (length);
5536 result_chain->quick_grow (length);
5537 memcpy (result_chain->address (), dr_chain.address (),
5538 length * sizeof (tree));
5540 if (length == 3)
5542 /* vect_grouped_store_supported ensures that this is constant. */
5543 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5544 unsigned int j0 = 0, j1 = 0, j2 = 0;
5546 vec_perm_builder sel (nelt, nelt, 1);
5547 sel.quick_grow (nelt);
5548 vec_perm_indices indices;
5549 for (j = 0; j < 3; j++)
5551 int nelt0 = ((3 - j) * nelt) % 3;
5552 int nelt1 = ((3 - j) * nelt + 1) % 3;
5553 int nelt2 = ((3 - j) * nelt + 2) % 3;
5555 for (i = 0; i < nelt; i++)
5557 if (3 * i + nelt0 < nelt)
5558 sel[3 * i + nelt0] = j0++;
5559 if (3 * i + nelt1 < nelt)
5560 sel[3 * i + nelt1] = nelt + j1++;
5561 if (3 * i + nelt2 < nelt)
5562 sel[3 * i + nelt2] = 0;
5564 indices.new_vector (sel, 2, nelt);
5565 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5567 for (i = 0; i < nelt; i++)
5569 if (3 * i + nelt0 < nelt)
5570 sel[3 * i + nelt0] = 3 * i + nelt0;
5571 if (3 * i + nelt1 < nelt)
5572 sel[3 * i + nelt1] = 3 * i + nelt1;
5573 if (3 * i + nelt2 < nelt)
5574 sel[3 * i + nelt2] = nelt + j2++;
5576 indices.new_vector (sel, 2, nelt);
5577 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5579 vect1 = dr_chain[0];
5580 vect2 = dr_chain[1];
5582 /* Create interleaving stmt:
5583 low = VEC_PERM_EXPR <vect1, vect2,
5584 {j, nelt, *, j + 1, nelt + j + 1, *,
5585 j + 2, nelt + j + 2, *, ...}> */
5586 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5587 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5588 vect2, perm3_mask_low);
5589 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5591 vect1 = data_ref;
5592 vect2 = dr_chain[2];
5593 /* Create interleaving stmt:
5594 low = VEC_PERM_EXPR <vect1, vect2,
5595 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5596 6, 7, nelt + j + 2, ...}> */
5597 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5598 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5599 vect2, perm3_mask_high);
5600 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5601 (*result_chain)[j] = data_ref;
5604 else
5606 /* If length is not equal to 3 then only power of 2 is supported. */
5607 gcc_assert (pow2p_hwi (length));
5609 /* The encoding has 2 interleaved stepped patterns. */
5610 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5611 vec_perm_builder sel (nelt, 2, 3);
5612 sel.quick_grow (6);
5613 for (i = 0; i < 3; i++)
5615 sel[i * 2] = i;
5616 sel[i * 2 + 1] = i + nelt;
5618 vec_perm_indices indices (sel, 2, nelt);
5619 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5621 for (i = 0; i < 6; i++)
5622 sel[i] += exact_div (nelt, 2);
5623 indices.new_vector (sel, 2, nelt);
5624 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5626 for (i = 0, n = log_length; i < n; i++)
5628 for (j = 0; j < length/2; j++)
5630 vect1 = dr_chain[j];
5631 vect2 = dr_chain[j+length/2];
5633 /* Create interleaving stmt:
5634 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5635 ...}> */
5636 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5637 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5638 vect2, perm_mask_high);
5639 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5640 (*result_chain)[2*j] = high;
5642 /* Create interleaving stmt:
5643 low = VEC_PERM_EXPR <vect1, vect2,
5644 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5645 ...}> */
5646 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5647 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5648 vect2, perm_mask_low);
5649 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5650 (*result_chain)[2*j+1] = low;
5652 memcpy (dr_chain.address (), result_chain->address (),
5653 length * sizeof (tree));
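/* Illustrative sketch, not part of GCC: the worked example in the comment
   above can be reproduced with plain arrays.  The standalone program below
   models each VEC_PERM_EXPR of the power-of-two path as an interleave of the
   low (respectively high) halves of two input "vectors" and runs the
   log2 (LENGTH) stages on the 4 x 8 example, printing the documented result
   chain.  Names and types here are assumptions of the sketch only.

     #include <cstdio>
     #include <vector>

     // high[2k] <- a[k], high[2k+1] <- b[k]; low takes the upper halves.
     static void
     interleave (const std::vector<int> &a, const std::vector<int> &b,
                 std::vector<int> &high, std::vector<int> &low)
     {
       unsigned n = a.size ();
       high.resize (n);
       low.resize (n);
       for (unsigned k = 0; k < n / 2; k++)
         {
           high[2 * k] = a[k];
           high[2 * k + 1] = b[k];
           low[2 * k] = a[n / 2 + k];
           low[2 * k + 1] = b[n / 2 + k];
         }
     }

     int
     main ()
     {
       std::vector<std::vector<int>> chain (4, std::vector<int> (8));
       for (int j = 0; j < 4; j++)
         for (int i = 0; i < 8; i++)
           chain[j][i] = j * 8 + i;      // the element numbering used above

       for (int stage = 0; stage < 2; stage++)   // log2 (4) stages
         {
           std::vector<std::vector<int>> next (4);
           for (int j = 0; j < 2; j++)
             interleave (chain[j], chain[j + 2], next[2 * j], next[2 * j + 1]);
           chain = next;
         }

       for (int j = 0; j < 4; j++)       // prints 0 8 16 24 1 9 17 25, etc.
         {
           for (int i = 0; i < 8; i++)
             printf ("%d ", chain[j][i]);
           printf ("\n");
         }
       return 0;
     }
   */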
5658 /* Function vect_setup_realignment
5660 This function is called when vectorizing an unaligned load using
5661 the dr_explicit_realign[_optimized] scheme.
5662 This function generates the following code at the loop prolog:
5664 p = initial_addr;
5665 x msq_init = *(floor(p)); # prolog load
5666 realignment_token = call target_builtin;
5667 loop:
5668 x msq = phi (msq_init, ---)
5670 The stmts marked with x are generated only for the case of
5671 dr_explicit_realign_optimized.
5673 The code above sets up a new (vector) pointer, pointing to the first
5674 location accessed by STMT_INFO, and a "floor-aligned" load using that
5675 pointer. It also generates code to compute the "realignment-token"
5676 (if the relevant target hook was defined), and creates a phi-node at the
5677 loop-header bb whose arguments are the result of the prolog-load (created
5678 by this function) and the result of a load that takes place in the loop
5679 (to be created by the caller to this function).
5681 For the case of dr_explicit_realign_optimized:
5682 The caller to this function uses the phi-result (msq) to create the
5683 realignment code inside the loop, and sets up the missing phi argument,
5684 as follows:
5685 loop:
5686 msq = phi (msq_init, lsq)
5687 lsq = *(floor(p')); # load in loop
5688 result = realign_load (msq, lsq, realignment_token);
5690 For the case of dr_explicit_realign:
5691 loop:
5692 msq = *(floor(p)); # load in loop
5693 p' = p + (VS-1);
5694 lsq = *(floor(p')); # load in loop
5695 result = realign_load (msq, lsq, realignment_token);
5697 Input:
5698 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5699 a memory location that may be unaligned.
5700 BSI - place where new code is to be inserted.
5701 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5702 is used.
5704 Output:
5705 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5706 target hook, if defined.
5707 Return value - the result of the loop-header phi node. */
5709 tree
5710 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5711 gimple_stmt_iterator *gsi, tree *realignment_token,
5712 enum dr_alignment_support alignment_support_scheme,
5713 tree init_addr,
5714 class loop **at_loop)
5716 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5717 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5718 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5719 struct data_reference *dr = dr_info->dr;
5720 class loop *loop = NULL;
5721 edge pe = NULL;
5722 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5723 tree vec_dest;
5724 gimple *inc;
5725 tree ptr;
5726 tree data_ref;
5727 basic_block new_bb;
5728 tree msq_init = NULL_TREE;
5729 tree new_temp;
5730 gphi *phi_stmt;
5731 tree msq = NULL_TREE;
5732 gimple_seq stmts = NULL;
5733 bool compute_in_loop = false;
5734 bool nested_in_vect_loop = false;
5735 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5736 class loop *loop_for_initial_load = NULL;
5738 if (loop_vinfo)
5740 loop = LOOP_VINFO_LOOP (loop_vinfo);
5741 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5744 gcc_assert (alignment_support_scheme == dr_explicit_realign
5745 || alignment_support_scheme == dr_explicit_realign_optimized);
5747 /* We need to generate three things:
5748 1. the misalignment computation
5749 2. the extra vector load (for the optimized realignment scheme).
5750 3. the phi node for the two vectors from which the realignment is
5751 done (for the optimized realignment scheme). */
5753 /* 1. Determine where to generate the misalignment computation.
5755 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5756 calculation will be generated by this function, outside the loop (in the
5757 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5758 caller, inside the loop.
5760 Background: If the misalignment remains fixed throughout the iterations of
5761 the loop, then both realignment schemes are applicable, and also the
5762 misalignment computation can be done outside LOOP. This is because we are
5763 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5764 are a multiple of VS (the Vector Size), and therefore the misalignment in
5765 different vectorized LOOP iterations is always the same.
5766 The problem arises only if the memory access is in an inner-loop nested
5767 inside LOOP, which is now being vectorized using outer-loop vectorization.
5768 This is the only case when the misalignment of the memory access may not
5769 remain fixed throughout the iterations of the inner-loop (as explained in
5770 detail in vect_supportable_dr_alignment). In this case, not only is the
5771 optimized realignment scheme not applicable, but also the misalignment
5772 computation (and generation of the realignment token that is passed to
5773 REALIGN_LOAD) have to be done inside the loop.
5775 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5776 or not, which in turn determines if the misalignment is computed inside
5777 the inner-loop, or outside LOOP. */
5779 if (init_addr != NULL_TREE || !loop_vinfo)
5781 compute_in_loop = true;
5782 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5786 /* 2. Determine where to generate the extra vector load.
5788 For the optimized realignment scheme, instead of generating two vector
5789 loads in each iteration, we generate a single extra vector load in the
5790 preheader of the loop, and in each iteration reuse the result of the
5791 vector load from the previous iteration. In case the memory access is in
5792 an inner-loop nested inside LOOP, which is now being vectorized using
5793 outer-loop vectorization, we need to determine whether this initial vector
5794 load should be generated at the preheader of the inner-loop, or can be
5795 generated at the preheader of LOOP. If the memory access has no evolution
5796 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5797 to be generated inside LOOP (in the preheader of the inner-loop). */
5799 if (nested_in_vect_loop)
5801 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5802 bool invariant_in_outerloop =
5803 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5804 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5806 else
5807 loop_for_initial_load = loop;
5808 if (at_loop)
5809 *at_loop = loop_for_initial_load;
5811 tree vuse = NULL_TREE;
5812 if (loop_for_initial_load)
5814 pe = loop_preheader_edge (loop_for_initial_load);
5815 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5816 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5818 if (!vuse)
5819 vuse = gimple_vuse (gsi_stmt (*gsi));
5821 /* 3. For the case of the optimized realignment, create the first vector
5822 load at the loop preheader. */
5824 if (alignment_support_scheme == dr_explicit_realign_optimized)
5826 /* Create msq_init = *(floor(p1)) in the loop preheader */
5827 gassign *new_stmt;
5829 gcc_assert (!compute_in_loop);
5830 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5831 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5832 loop_for_initial_load, NULL_TREE,
5833 &init_addr, NULL, &inc, true);
5834 if (TREE_CODE (ptr) == SSA_NAME)
5835 new_temp = copy_ssa_name (ptr);
5836 else
5837 new_temp = make_ssa_name (TREE_TYPE (ptr));
5838 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5839 tree type = TREE_TYPE (ptr);
5840 new_stmt = gimple_build_assign
5841 (new_temp, BIT_AND_EXPR, ptr,
5842 fold_build2 (MINUS_EXPR, type,
5843 build_int_cst (type, 0),
5844 build_int_cst (type, align)));
5845 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5846 gcc_assert (!new_bb);
5847 data_ref
5848 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5849 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5850 vect_copy_ref_info (data_ref, DR_REF (dr));
5851 new_stmt = gimple_build_assign (vec_dest, data_ref);
5852 new_temp = make_ssa_name (vec_dest, new_stmt);
5853 gimple_assign_set_lhs (new_stmt, new_temp);
5854 gimple_set_vuse (new_stmt, vuse);
5855 if (pe)
5857 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5858 gcc_assert (!new_bb);
5860 else
5861 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5863 msq_init = gimple_assign_lhs (new_stmt);
5866 /* 4. Create realignment token using a target builtin, if available.
5867 It is done either inside the containing loop, or before LOOP (as
5868 determined above). */
5870 if (targetm.vectorize.builtin_mask_for_load)
5872 gcall *new_stmt;
5873 tree builtin_decl;
5875 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5876 if (!init_addr)
5878 /* Generate the INIT_ADDR computation outside LOOP. */
5879 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5880 stmt_info, &stmts,
5881 NULL_TREE);
5882 if (loop)
5884 pe = loop_preheader_edge (loop);
5885 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5886 gcc_assert (!new_bb);
5888 else
5889 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5892 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5893 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5894 vec_dest =
5895 vect_create_destination_var (scalar_dest,
5896 gimple_call_return_type (new_stmt));
5897 new_temp = make_ssa_name (vec_dest, new_stmt);
5898 gimple_call_set_lhs (new_stmt, new_temp);
5900 if (compute_in_loop)
5901 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5902 else
5904 /* Generate the misalignment computation outside LOOP. */
5905 pe = loop_preheader_edge (loop);
5906 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5907 gcc_assert (!new_bb);
5910 *realignment_token = gimple_call_lhs (new_stmt);
5912 /* The result of the CALL_EXPR to this builtin is determined from
5913 the value of the parameter and no global variables are touched
5914 which makes the builtin a "const" function. Requiring the
5915 builtin to have the "const" attribute makes it unnecessary
5916 to call mark_call_clobbered. */
5917 gcc_assert (TREE_READONLY (builtin_decl));
5920 if (alignment_support_scheme == dr_explicit_realign)
5921 return msq;
5923 gcc_assert (!compute_in_loop);
5924 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5927 /* 5. Create msq = phi <msq_init, lsq> in loop */
5929 pe = loop_preheader_edge (containing_loop);
5930 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5931 msq = make_ssa_name (vec_dest);
5932 phi_stmt = create_phi_node (msq, containing_loop->header);
5933 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5935 return msq;
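/* Illustrative sketch, not part of GCC: a scalar model of what the
   REALIGN_LOAD used by both explicit realignment schemes computes.  Two
   aligned loads (msq from floor (p), lsq from the next aligned address) are
   combined according to a "realignment token", modelled here simply as the
   element misalignment; real targets encode it as a permute mask produced by
   builtin_mask_for_load.  Standalone C++ under the assumption of 4-element
   vectors of uint32_t.

     #include <cstdint>
     #include <cstring>
     #include <cstdio>

     struct v4si { uint32_t e[4]; };

     static v4si
     realign_load (const v4si &msq, const v4si &lsq, unsigned token)
     {
       // Whole-vector shift of the concatenation <msq, lsq> by TOKEN elements.
       v4si r;
       for (unsigned i = 0; i < 4; i++)
         r.e[i] = token + i < 4 ? msq.e[token + i] : lsq.e[token + i - 4];
       return r;
     }

     int
     main ()
     {
       uint32_t a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
       unsigned token = 1;               // &a[1] is one element past alignment
       v4si msq, lsq;
       memcpy (&msq, &a[0], sizeof msq); // floor-aligned (prolog or loop) load
       memcpy (&lsq, &a[4], sizeof lsq); // next aligned load
       v4si r = realign_load (msq, lsq, token);
       for (unsigned i = 0; i < 4; i++)
         printf ("%u ", (unsigned) r.e[i]);   // prints 1 2 3 4
       return 0;
     }
   */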
5939 /* Function vect_grouped_load_supported.
5941 COUNT is the size of the load group (the number of statements plus the
5942 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5943 only one statement, with a gap of COUNT - 1.
5945 Returns true if a suitable permute exists. */
5947 bool
5948 vect_grouped_load_supported (tree vectype, bool single_element_p,
5949 unsigned HOST_WIDE_INT count)
5951 machine_mode mode = TYPE_MODE (vectype);
5953 /* If this is single-element interleaving with an element distance
5954 that leaves unused vector loads around punt - we at least create
5955 very sub-optimal code in that case (and blow up memory,
5956 see PR65518). */
5957 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5959 if (dump_enabled_p ())
5960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5961 "single-element interleaving not supported "
5962 "for not adjacent vector loads\n");
5963 return false;
5966 /* vect_permute_load_chain requires the group size to be equal to 3 or
5967 be a power of two. */
5968 if (count != 3 && exact_log2 (count) == -1)
5970 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5972 "the size of the group of accesses"
5973 " is not a power of 2 or not equal to 3\n");
5974 return false;
5977 /* Check that the permutation is supported. */
5978 if (VECTOR_MODE_P (mode))
5980 unsigned int i, j;
5981 if (count == 3)
5983 unsigned int nelt;
5984 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5988 "cannot handle groups of 3 loads for"
5989 " variable-length vectors\n");
5990 return false;
5993 vec_perm_builder sel (nelt, nelt, 1);
5994 sel.quick_grow (nelt);
5995 vec_perm_indices indices;
5996 unsigned int k;
5997 for (k = 0; k < 3; k++)
5999 for (i = 0; i < nelt; i++)
6000 if (3 * i + k < 2 * nelt)
6001 sel[i] = 3 * i + k;
6002 else
6003 sel[i] = 0;
6004 indices.new_vector (sel, 2, nelt);
6005 if (!can_vec_perm_const_p (mode, mode, indices))
6007 if (dump_enabled_p ())
6008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6009 "shuffle of 3 loads is not supported by"
6010 " target\n");
6011 return false;
6013 for (i = 0, j = 0; i < nelt; i++)
6014 if (3 * i + k < 2 * nelt)
6015 sel[i] = i;
6016 else
6017 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6018 indices.new_vector (sel, 2, nelt);
6019 if (!can_vec_perm_const_p (mode, mode, indices))
6021 if (dump_enabled_p ())
6022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6023 "shuffle of 3 loads is not supported by"
6024 " target\n");
6025 return false;
6028 return true;
6030 else
6032 /* If length is not equal to 3 then only power of 2 is supported. */
6033 gcc_assert (pow2p_hwi (count));
6034 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6036 /* The encoding has a single stepped pattern. */
6037 vec_perm_builder sel (nelt, 1, 3);
6038 sel.quick_grow (3);
6039 for (i = 0; i < 3; i++)
6040 sel[i] = i * 2;
6041 vec_perm_indices indices (sel, 2, nelt);
6042 if (can_vec_perm_const_p (mode, mode, indices))
6044 for (i = 0; i < 3; i++)
6045 sel[i] = i * 2 + 1;
6046 indices.new_vector (sel, 2, nelt);
6047 if (can_vec_perm_const_p (mode, mode, indices))
6048 return true;
6053 if (dump_enabled_p ())
6054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6055 "extract even/odd not supported by target\n");
6056 return false;
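/* Illustrative sketch, not part of GCC: the "single stepped pattern" encoding
   used above describes a full selector by NPATTERNS interleaved sequences of
   which only 3 leading elements are stored; later elements continue with the
   step between the 2nd and 3rd stored element.  The standalone helper below
   is an assumption of this sketch (it is not the real vec_perm_builder API);
   it expands such an encoding and shows that the seed {0, 2, 4} yields the
   extract-even selector for 8-element vectors.

     #include <cstdio>
     #include <vector>

     static std::vector<long>
     expand_encoding (const std::vector<long> &seed, unsigned npatterns,
                      unsigned full_nelts)
     {
       std::vector<long> sel (full_nelts);
       for (unsigned i = 0; i < full_nelts; i++)
         {
           unsigned pat = i % npatterns;   // which interleaved pattern
           unsigned idx = i / npatterns;   // position within that pattern
           if (idx < 3)
             sel[i] = seed[idx * npatterns + pat];
           else
             {
               long step = seed[2 * npatterns + pat] - seed[npatterns + pat];
               sel[i] = seed[2 * npatterns + pat] + (idx - 2) * step;
             }
         }
       return sel;
     }

     int
     main ()
     {
       std::vector<long> even = expand_encoding ({0, 2, 4}, 1, 8);
       for (long e : even)
         printf ("%ld ", e);              // prints 0 2 4 6 8 10 12 14
       printf ("\n");
       return 0;
     }

   The extract-odd selector is obtained the same way from the seed {1, 3, 5}.  */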
6059 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6060 type VECTYPE. MASKED_P says whether the masked form is needed. */
6062 bool
6063 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6064 bool masked_p)
6066 if (masked_p)
6067 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6068 vec_mask_load_lanes_optab,
6069 vectype, count);
6070 else
6071 return vect_lanes_optab_supported_p ("vec_load_lanes",
6072 vec_load_lanes_optab,
6073 vectype, count);
6076 /* Function vect_permute_load_chain.
6078 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6079 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6080 the input data correctly. Return the final references for loads in
6081 RESULT_CHAIN.
6083 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6084 The input is 4 vectors each containing 8 elements. We assign a number to each
6085 element, the input sequence is:
6087 1st vec: 0 1 2 3 4 5 6 7
6088 2nd vec: 8 9 10 11 12 13 14 15
6089 3rd vec: 16 17 18 19 20 21 22 23
6090 4th vec: 24 25 26 27 28 29 30 31
6092 The output sequence should be:
6094 1st vec: 0 4 8 12 16 20 24 28
6095 2nd vec: 1 5 9 13 17 21 25 29
6096 3rd vec: 2 6 10 14 18 22 26 30
6097 4th vec: 3 7 11 15 19 23 27 31
6099 i.e., the first output vector should contain the first elements of each
6100 interleaving group, etc.
6102 We use extract_even/odd instructions to create such output. The input of
6103 each extract_even/odd operation is two vectors
6104 1st vec 2nd vec
6105 0 1 2 3 4 5 6 7
6107 and the output is the vector of extracted even/odd elements. The output of
6108 extract_even will be: 0 2 4 6
6109 and of extract_odd: 1 3 5 7
6112 The permutation is done in log LENGTH stages. In each stage extract_even
6113 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6114 their order. In our example,
6116 E1: extract_even (1st vec, 2nd vec)
6117 E2: extract_odd (1st vec, 2nd vec)
6118 E3: extract_even (3rd vec, 4th vec)
6119 E4: extract_odd (3rd vec, 4th vec)
6121 The output for the first stage will be:
6123 E1: 0 2 4 6 8 10 12 14
6124 E2: 1 3 5 7 9 11 13 15
6125 E3: 16 18 20 22 24 26 28 30
6126 E4: 17 19 21 23 25 27 29 31
6128 In order to proceed and create the correct sequence for the next stage (or
6129 for the correct output, if the second stage is the last one, as in our
6130 example), we first put the output of extract_even operation and then the
6131 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6132 The input for the second stage is:
6134 1st vec (E1): 0 2 4 6 8 10 12 14
6135 2nd vec (E3): 16 18 20 22 24 26 28 30
6136 3rd vec (E2): 1 3 5 7 9 11 13 15
6137 4th vec (E4): 17 19 21 23 25 27 29 31
6139 The output of the second stage:
6141 E1: 0 4 8 12 16 20 24 28
6142 E2: 2 6 10 14 18 22 26 30
6143 E3: 1 5 9 13 17 21 25 29
6144 E4: 3 7 11 15 19 23 27 31
6146 And RESULT_CHAIN after reordering:
6148 1st vec (E1): 0 4 8 12 16 20 24 28
6149 2nd vec (E3): 1 5 9 13 17 21 25 29
6150 3rd vec (E2): 2 6 10 14 18 22 26 30
6151 4th vec (E4): 3 7 11 15 19 23 27 31. */
6153 static void
6154 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6155 unsigned int length,
6156 stmt_vec_info stmt_info,
6157 gimple_stmt_iterator *gsi,
6158 vec<tree> *result_chain)
6160 tree data_ref, first_vect, second_vect;
6161 tree perm_mask_even, perm_mask_odd;
6162 tree perm3_mask_low, perm3_mask_high;
6163 gimple *perm_stmt;
6164 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6165 unsigned int i, j, log_length = exact_log2 (length);
6167 result_chain->quick_grow (length);
6168 memcpy (result_chain->address (), dr_chain.address (),
6169 length * sizeof (tree));
6171 if (length == 3)
6173 /* vect_grouped_load_supported ensures that this is constant. */
6174 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6175 unsigned int k;
6177 vec_perm_builder sel (nelt, nelt, 1);
6178 sel.quick_grow (nelt);
6179 vec_perm_indices indices;
6180 for (k = 0; k < 3; k++)
6182 for (i = 0; i < nelt; i++)
6183 if (3 * i + k < 2 * nelt)
6184 sel[i] = 3 * i + k;
6185 else
6186 sel[i] = 0;
6187 indices.new_vector (sel, 2, nelt);
6188 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6190 for (i = 0, j = 0; i < nelt; i++)
6191 if (3 * i + k < 2 * nelt)
6192 sel[i] = i;
6193 else
6194 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6195 indices.new_vector (sel, 2, nelt);
6196 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6198 first_vect = dr_chain[0];
6199 second_vect = dr_chain[1];
6201 /* Create interleaving stmt (low part of):
6202 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6203 ...}> */
6204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6205 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6206 second_vect, perm3_mask_low);
6207 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6209 /* Create interleaving stmt (high part of):
6210 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6211 ...}> */
6212 first_vect = data_ref;
6213 second_vect = dr_chain[2];
6214 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6215 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6216 second_vect, perm3_mask_high);
6217 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6218 (*result_chain)[k] = data_ref;
6221 else
6223 /* If length is not equal to 3 then only power of 2 is supported. */
6224 gcc_assert (pow2p_hwi (length));
6226 /* The encoding has a single stepped pattern. */
6227 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6228 vec_perm_builder sel (nelt, 1, 3);
6229 sel.quick_grow (3);
6230 for (i = 0; i < 3; ++i)
6231 sel[i] = i * 2;
6232 vec_perm_indices indices (sel, 2, nelt);
6233 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6235 for (i = 0; i < 3; ++i)
6236 sel[i] = i * 2 + 1;
6237 indices.new_vector (sel, 2, nelt);
6238 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6240 for (i = 0; i < log_length; i++)
6242 for (j = 0; j < length; j += 2)
6244 first_vect = dr_chain[j];
6245 second_vect = dr_chain[j+1];
6247 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6248 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6249 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6250 first_vect, second_vect,
6251 perm_mask_even);
6252 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6253 (*result_chain)[j/2] = data_ref;
6255 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6256 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6257 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6258 first_vect, second_vect,
6259 perm_mask_odd);
6260 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6261 (*result_chain)[j/2+length/2] = data_ref;
6263 memcpy (dr_chain.address (), result_chain->address (),
6264 length * sizeof (tree));
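/* Illustrative sketch, not part of GCC: the extract-even/odd algorithm in the
   comment above, modelled on plain arrays.  Each stage writes the even
   extraction of a pair into the first half of the result chain and the odd
   extraction into the second half, exactly like the loop above, and the 4 x 8
   example ends up fully de-interleaved.  Standalone C++; all names are local
   to the sketch.

     #include <cstdio>
     #include <vector>

     // Elements PARITY, PARITY + 2, PARITY + 4, ... of the concatenation <a, b>.
     static std::vector<int>
     extract (const std::vector<int> &a, const std::vector<int> &b, int parity)
     {
       std::vector<int> r (a.size ());
       for (unsigned i = 0; i < a.size (); i++)
         {
           unsigned src = 2 * i + parity;
           r[i] = src < a.size () ? a[src] : b[src - a.size ()];
         }
       return r;
     }

     int
     main ()
     {
       const unsigned length = 4, nelt = 8;
       std::vector<std::vector<int>> chain (length, std::vector<int> (nelt));
       for (unsigned j = 0; j < length; j++)
         for (unsigned i = 0; i < nelt; i++)
           chain[j][i] = j * nelt + i;

       for (unsigned stage = 0; stage < 2; stage++)   // log2 (length) stages
         {
           std::vector<std::vector<int>> result (length);
           for (unsigned j = 0; j < length; j += 2)
             {
               result[j / 2] = extract (chain[j], chain[j + 1], 0);
               result[j / 2 + length / 2] = extract (chain[j], chain[j + 1], 1);
             }
           chain = result;
         }

       for (unsigned j = 0; j < length; j++)  // 0 4 8 12 ... / 1 5 9 13 ... / ...
         {
           for (unsigned i = 0; i < nelt; i++)
             printf ("%d ", chain[j][i]);
           printf ("\n");
         }
       return 0;
     }
   */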
6269 /* Function vect_shift_permute_load_chain.
6271 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6272 sequence of stmts to reorder the input data accordingly.
6273 Return the final references for loads in RESULT_CHAIN.
6274 Return true if successful, false otherwise.
6276 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6277 The input is 3 vectors each containing 8 elements. We assign a
6278 number to each element, the input sequence is:
6280 1st vec: 0 1 2 3 4 5 6 7
6281 2nd vec: 8 9 10 11 12 13 14 15
6282 3rd vec: 16 17 18 19 20 21 22 23
6284 The output sequence should be:
6286 1st vec: 0 3 6 9 12 15 18 21
6287 2nd vec: 1 4 7 10 13 16 19 22
6288 3rd vec: 2 5 8 11 14 17 20 23
6290 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6292 First we shuffle all 3 vectors to get correct elements order:
6294 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6295 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6296 3rd vec: (16 19 22) (17 20 23) (18 21)
6298 Next we unite and shift vector 3 times:
6300 1st step:
6301 shift right by 6 the concatenation of:
6302 "1st vec" and "2nd vec"
6303 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6304 "2nd vec" and "3rd vec"
6305 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6306 "3rd vec" and "1st vec"
6307 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6308 | New vectors |
6310 So that now new vectors are:
6312 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6313 2nd vec: (10 13) (16 19 22) (17 20 23)
6314 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6316 2nd step:
6317 shift right by 5 the concatenation of:
6318 "1st vec" and "3rd vec"
6319 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6320 "2nd vec" and "1st vec"
6321 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6322 "3rd vec" and "2nd vec"
6323 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6324 | New vectors |
6326 So that now new vectors are:
6328 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6329 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6330 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6332 3rd step:
6333 shift right by 5 the concatenation of:
6334 "1st vec" and "1st vec"
6335 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6336 shift right by 3 the concatenation of:
6337 "2nd vec" and "2nd vec"
6338 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6339 | New vectors |
6341 So that now all vectors are READY:
6342 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6343 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6344 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6346 This algorithm is faster than one in vect_permute_load_chain if:
6347 1. "shift of a concatination" is faster than general permutation.
6348 This is usually so.
6349 2. The TARGET machine can't execute vector instructions in parallel.
6350 This is because each step of the algorithm depends on previous.
6351 The algorithm in vect_permute_load_chain is much more parallel.
6353 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6356 static bool
6357 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6358 unsigned int length,
6359 stmt_vec_info stmt_info,
6360 gimple_stmt_iterator *gsi,
6361 vec<tree> *result_chain)
6363 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6364 tree perm2_mask1, perm2_mask2, perm3_mask;
6365 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6366 gimple *perm_stmt;
6368 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6369 machine_mode vmode = TYPE_MODE (vectype);
6370 unsigned int i;
6371 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6373 unsigned HOST_WIDE_INT nelt, vf;
6374 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6375 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6376 /* Not supported for variable-length vectors. */
6377 return false;
6379 vec_perm_builder sel (nelt, nelt, 1);
6380 sel.quick_grow (nelt);
6382 result_chain->quick_grow (length);
6383 memcpy (result_chain->address (), dr_chain.address (),
6384 length * sizeof (tree));
6386 if (pow2p_hwi (length) && vf > 4)
6388 unsigned int j, log_length = exact_log2 (length);
6389 for (i = 0; i < nelt / 2; ++i)
6390 sel[i] = i * 2;
6391 for (i = 0; i < nelt / 2; ++i)
6392 sel[nelt / 2 + i] = i * 2 + 1;
6393 vec_perm_indices indices (sel, 2, nelt);
6394 if (!can_vec_perm_const_p (vmode, vmode, indices))
6396 if (dump_enabled_p ())
6397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398 "shuffle of 2 fields structure is not \
6399 supported by target\n");
6400 return false;
6402 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6404 for (i = 0; i < nelt / 2; ++i)
6405 sel[i] = i * 2 + 1;
6406 for (i = 0; i < nelt / 2; ++i)
6407 sel[nelt / 2 + i] = i * 2;
6408 indices.new_vector (sel, 2, nelt);
6409 if (!can_vec_perm_const_p (vmode, vmode, indices))
6411 if (dump_enabled_p ())
6412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6413 "shuffle of 2 fields structure is not \
6414 supported by target\n");
6415 return false;
6417 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6419 /* Generating permutation constant to shift all elements.
6420 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6421 for (i = 0; i < nelt; i++)
6422 sel[i] = nelt / 2 + i;
6423 indices.new_vector (sel, 2, nelt);
6424 if (!can_vec_perm_const_p (vmode, vmode, indices))
6426 if (dump_enabled_p ())
6427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 "shift permutation is not supported by target\n");
6429 return false;
6431 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6433 /* Generating permutation constant to select vector from 2.
6434 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6435 for (i = 0; i < nelt / 2; i++)
6436 sel[i] = i;
6437 for (i = nelt / 2; i < nelt; i++)
6438 sel[i] = nelt + i;
6439 indices.new_vector (sel, 2, nelt);
6440 if (!can_vec_perm_const_p (vmode, vmode, indices))
6442 if (dump_enabled_p ())
6443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6444 "select is not supported by target\n");
6445 return false;
6447 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6449 for (i = 0; i < log_length; i++)
6451 for (j = 0; j < length; j += 2)
6453 first_vect = dr_chain[j];
6454 second_vect = dr_chain[j + 1];
6456 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6457 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6458 first_vect, first_vect,
6459 perm2_mask1);
6460 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6461 vect[0] = data_ref;
6463 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6464 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6465 second_vect, second_vect,
6466 perm2_mask2);
6467 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6468 vect[1] = data_ref;
6470 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6471 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6472 vect[0], vect[1], shift1_mask);
6473 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6474 (*result_chain)[j/2 + length/2] = data_ref;
6476 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6477 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6478 vect[0], vect[1], select_mask);
6479 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6480 (*result_chain)[j/2] = data_ref;
6482 memcpy (dr_chain.address (), result_chain->address (),
6483 length * sizeof (tree));
6485 return true;
6487 if (length == 3 && vf > 2)
6489 unsigned int k = 0, l = 0;
6491 /* Generating permutation constant to get all elements in right order.
6492 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6493 for (i = 0; i < nelt; i++)
6495 if (3 * k + (l % 3) >= nelt)
6497 k = 0;
6498 l += (3 - (nelt % 3));
6500 sel[i] = 3 * k + (l % 3);
6501 k++;
6503 vec_perm_indices indices (sel, 2, nelt);
6504 if (!can_vec_perm_const_p (vmode, vmode, indices))
6506 if (dump_enabled_p ())
6507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6508 "shuffle of 3 fields structure is not \
6509 supported by target\n");
6510 return false;
6512 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6514 /* Generating permutation constant to shift all elements.
6515 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6516 for (i = 0; i < nelt; i++)
6517 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6518 indices.new_vector (sel, 2, nelt);
6519 if (!can_vec_perm_const_p (vmode, vmode, indices))
6521 if (dump_enabled_p ())
6522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6523 "shift permutation is not supported by target\n");
6524 return false;
6526 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6528 /* Generating permutation constant to shift all elements.
6529 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6530 for (i = 0; i < nelt; i++)
6531 sel[i] = 2 * (nelt / 3) + 1 + i;
6532 indices.new_vector (sel, 2, nelt);
6533 if (!can_vec_perm_const_p (vmode, vmode, indices))
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6537 "shift permutation is not supported by target\n");
6538 return false;
6540 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6542 /* Generating permutation constant to shift all elements.
6543 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6544 for (i = 0; i < nelt; i++)
6545 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6546 indices.new_vector (sel, 2, nelt);
6547 if (!can_vec_perm_const_p (vmode, vmode, indices))
6549 if (dump_enabled_p ())
6550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551 "shift permutation is not supported by target\n");
6552 return false;
6554 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6556 /* Generating permutation constant to shift all elements.
6557 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6558 for (i = 0; i < nelt; i++)
6559 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6560 indices.new_vector (sel, 2, nelt);
6561 if (!can_vec_perm_const_p (vmode, vmode, indices))
6563 if (dump_enabled_p ())
6564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6565 "shift permutation is not supported by target\n");
6566 return false;
6568 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6570 for (k = 0; k < 3; k++)
6572 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6573 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6574 dr_chain[k], dr_chain[k],
6575 perm3_mask);
6576 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6577 vect[k] = data_ref;
6580 for (k = 0; k < 3; k++)
6582 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6583 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6584 vect[k % 3], vect[(k + 1) % 3],
6585 shift1_mask);
6586 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6587 vect_shift[k] = data_ref;
6590 for (k = 0; k < 3; k++)
6592 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6593 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6594 vect_shift[(4 - k) % 3],
6595 vect_shift[(3 - k) % 3],
6596 shift2_mask);
6597 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6598 vect[k] = data_ref;
6601 (*result_chain)[3 - (nelt % 3)] = vect[2];
6603 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6604 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6605 vect[0], shift3_mask);
6606 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6607 (*result_chain)[nelt % 3] = data_ref;
6609 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6610 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6611 vect[1], shift4_mask);
6612 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6613 (*result_chain)[0] = data_ref;
6614 return true;
6616 return false;
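/* Illustrative sketch, not part of GCC: the LENGTH == 3 path above builds one
   gather mask and four "shift" masks whose concrete values for an 8-element
   vector are quoted in the comments.  The standalone program below recomputes
   them with the same formulas so the quoted constants can be checked; it uses
   no GCC APIs.

     #include <cstdio>

     int
     main ()
     {
       const unsigned nelt = 8;
       unsigned sel[nelt];

       // perm3_mask: every third element, giving {0 3 6 1 4 7 2 5}.
       unsigned k = 0, l = 0;
       for (unsigned i = 0; i < nelt; i++)
         {
           if (3 * k + (l % 3) >= nelt)
             {
               k = 0;
               l += 3 - (nelt % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
       for (unsigned i = 0; i < nelt; i++)
         printf ("%u ", sel[i]);                  // 0 3 6 1 4 7 2 5
       printf ("\n");

       // The shift masks are ramps; only their first element differs.
       printf ("shift1 %u\n", 2 * (nelt / 3) + (nelt % 3));        // 6
       printf ("shift2 %u\n", 2 * (nelt / 3) + 1);                 // 5
       printf ("shift3 %u\n", (nelt / 3) + (nelt % 3) / 2);        // 3
       printf ("shift4 %u\n", 2 * (nelt / 3) + (nelt % 3) / 2);    // 5
       return 0;
     }
   */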
6619 /* Function vect_transform_grouped_load.
6621 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6622 to perform their permutation and ascribe the result vectorized statements to
6623 the scalar statements.
6626 void
6627 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6628 vec<tree> dr_chain,
6629 int size, gimple_stmt_iterator *gsi)
6631 machine_mode mode;
6632 vec<tree> result_chain = vNULL;
6634 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6635 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6636 vectors, that are ready for vector computation. */
6637 result_chain.create (size);
6639 /* If the reassociation width for the vector type is 2 or greater, the target
6640 machine can execute 2 or more vector instructions in parallel. Otherwise
6641 try to get the chain for the load group using vect_shift_permute_load_chain. */
6642 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6643 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6644 || pow2p_hwi (size)
6645 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6646 gsi, &result_chain))
6647 vect_permute_load_chain (vinfo, dr_chain,
6648 size, stmt_info, gsi, &result_chain);
6649 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6650 result_chain.release ();
6653 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6654 generated as part of the vectorization of STMT_INFO. Assign the statement
6655 for each vector to the associated scalar statement. */
6657 void
6658 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6659 vec<tree> result_chain)
6661 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6662 unsigned int i, gap_count;
6663 tree tmp_data_ref;
6665 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6666 Since we scan the chain starting from its first node, their order
6667 corresponds to the order of data-refs in RESULT_CHAIN. */
6668 stmt_vec_info next_stmt_info = first_stmt_info;
6669 gap_count = 1;
6670 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6672 if (!next_stmt_info)
6673 break;
6675 /* Skip the gaps. Loads created for the gaps will be removed by dead
6676 code elimination pass later. No need to check for the first stmt in
6677 the group, since it always exists.
6678 DR_GROUP_GAP is the number of steps in elements from the previous
6679 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6680 correspond to the gaps. */
6681 if (next_stmt_info != first_stmt_info
6682 && gap_count < DR_GROUP_GAP (next_stmt_info))
6684 gap_count++;
6685 continue;
6688 /* ??? The following needs cleanup after the removal of
6689 DR_GROUP_SAME_DR_STMT. */
6690 if (next_stmt_info)
6692 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6693 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6694 copies, and we put the new vector statement last. */
6695 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6697 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6698 gap_count = 1;
6703 /* Function vect_can_force_dr_alignment_p.
6705 Returns whether the alignment of a DECL can be forced to be aligned
6706 on ALIGNMENT bit boundary. */
6708 bool
6709 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6711 if (!VAR_P (decl))
6712 return false;
6714 if (decl_in_symtab_p (decl)
6715 && !symtab_node::get (decl)->can_increase_alignment_p ())
6716 return false;
6718 if (TREE_STATIC (decl))
6719 return (known_le (alignment,
6720 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6721 else
6722 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6725 /* Return whether the data reference DR_INFO is supported with respect to its
6726 alignment.
6727 VECTYPE is the vector type used for the access and MISALIGNMENT its known
6728 misalignment (DR_MISALIGNMENT_UNKNOWN if unknown), i.e. check whether it is
6729 possible to vectorize the access with that misalignment. */
6731 enum dr_alignment_support
6732 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6733 tree vectype, int misalignment)
6735 data_reference *dr = dr_info->dr;
6736 stmt_vec_info stmt_info = dr_info->stmt;
6737 machine_mode mode = TYPE_MODE (vectype);
6738 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6739 class loop *vect_loop = NULL;
6740 bool nested_in_vect_loop = false;
6742 if (misalignment == 0)
6743 return dr_aligned;
6745 /* For now assume all conditional loads/stores support unaligned
6746 access without any special code. */
6747 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6748 if (gimple_call_internal_p (stmt)
6749 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6750 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6751 return dr_unaligned_supported;
6753 if (loop_vinfo)
6755 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6756 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6759 /* Possibly unaligned access. */
6761 /* We can choose between using the implicit realignment scheme (generating
6762 a misaligned_move stmt) and the explicit realignment scheme (generating
6763 aligned loads with a REALIGN_LOAD). There are two variants to the
6764 explicit realignment scheme: optimized, and unoptimized.
6765 We can optimize the realignment only if the step between consecutive
6766 vector loads is equal to the vector size. Since the vector memory
6767 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6768 is guaranteed that the misalignment amount remains the same throughout the
6769 execution of the vectorized loop. Therefore, we can create the
6770 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6771 at the loop preheader.
6773 However, in the case of outer-loop vectorization, when vectorizing a
6774 memory access in the inner-loop nested within the LOOP that is now being
6775 vectorized, while it is guaranteed that the misalignment of the
6776 vectorized memory access will remain the same in different outer-loop
6777 iterations, it is *not* guaranteed that it will remain the same throughout
6778 the execution of the inner-loop. This is because the inner-loop advances
6779 with the original scalar step (and not in steps of VS). If the inner-loop
6780 step happens to be a multiple of VS, then the misalignment remains fixed
6781 and we can use the optimized realignment scheme. For example:
6783 for (i=0; i<N; i++)
6784 for (j=0; j<M; j++)
6785 s += a[i+j];
6787 When vectorizing the i-loop in the above example, the step between
6788 consecutive vector loads is 1, and so the misalignment does not remain
6789 fixed across the execution of the inner-loop, and the realignment cannot
6790 be optimized (as illustrated in the following pseudo vectorized loop):
6792 for (i=0; i<N; i+=4)
6793 for (j=0; j<M; j++){
6794 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6795 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6796 // (assuming that we start from an aligned address).
6799 We therefore have to use the unoptimized realignment scheme:
6801 for (i=0; i<N; i+=4)
6802 for (j=k; j<M; j+=4)
6803 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6804 // that the misalignment of the initial address is
6805 // 0).
6807 The loop can then be vectorized as follows:
6809 for (k=0; k<4; k++){
6810 rt = get_realignment_token (&vp[k]);
6811 for (i=0; i<N; i+=4){
6812 v1 = vp[i+k];
6813 for (j=k; j<M; j+=4){
6814 v2 = vp[i+j+VS-1];
6815 va = REALIGN_LOAD <v1,v2,rt>;
6816 vs += va;
6817 v1 = v2;
6820 } */
6822 if (DR_IS_READ (dr))
6824 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6825 && (!targetm.vectorize.builtin_mask_for_load
6826 || targetm.vectorize.builtin_mask_for_load ()))
6828 /* If we are doing SLP then the accesses need not have the
6829 same alignment, instead it depends on the SLP group size. */
6830 if (loop_vinfo
6831 && STMT_SLP_TYPE (stmt_info)
6832 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
6833 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6834 * (DR_GROUP_SIZE
6835 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6836 TYPE_VECTOR_SUBPARTS (vectype))))
6837 ;
6838 else if (!loop_vinfo
6839 || (nested_in_vect_loop
6840 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6841 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6842 return dr_explicit_realign;
6843 else
6844 return dr_explicit_realign_optimized;
6848 bool is_packed = false;
6849 tree type = TREE_TYPE (DR_REF (dr));
6850 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6851 is_packed = not_size_aligned (DR_REF (dr));
6852 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6853 is_packed))
6854 return dr_unaligned_supported;
6856 /* Unsupported. */
6857 return dr_unaligned_unsupported;
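/* Illustrative sketch, not part of GCC: the comment above argues that for an
   access like a[i+j] in an inner j-loop the misalignment changes every inner
   iteration, while stepping the outer i-loop by a whole vector keeps the
   pattern fixed across outer iterations.  The standalone program below just
   prints the element misalignment (i + j) % VS for an assumed 4-element
   vector, reproducing the {0,1,2,3,0,1,2,3,...} pattern from the pseudo code.

     #include <cstdio>

     int
     main ()
     {
       const int VS = 4;
       for (int i = 0; i < 8; i += VS)       // outer, vectorized loop
         {
           for (int j = 0; j < 8; j++)       // inner loop, scalar step
             printf ("%d ", (i + j) % VS);   // misalignment of &a[i + j]
           printf ("\n");                    // each row: 0 1 2 3 0 1 2 3
         }
       return 0;
     }

   Within each row the misalignment cycles, which is why the optimized scheme
   (realignment token computed once in the preheader) cannot be used there and
   dr_explicit_realign is chosen instead.  */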