gcc/tree-vect-data-refs.cc

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56 #include "gimple-fold.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100
 101 /* Return the smallest scalar part of STMT_INFO.
 102    This is used to determine the vectype of the stmt.  We generally set the
 103    vectype according to the type of the result (lhs).  For stmts whose
 104    result-type is different than the type of the arguments (e.g., demotion,
 105    promotion), vectype will be reset appropriately (later).  Note that we have
 106    to visit the smallest datatype in this function, because that determines the
 107    VF.  If the smallest datatype in the loop is present only as the rhs of a
 108    promotion operation - we'd miss it.
 109    Such a case, where a variable of this datatype does not appear in the lhs
 110    anywhere in the loop, can only occur if it's an invariant: e.g.:
 111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 112    invariant motion.  However, we cannot rely on invariant motion to always
 113    take invariants out of the loop, and so in the case of promotion we also
 114    have to check the rhs.
 115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 116    types.  */
 117
 118 tree
 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 120 {
 121   HOST_WIDE_INT lhs, rhs;
 122
 123   /* During the analysis phase, this function is called on arbitrary
 124      statements that might not have scalar results.  */
 125   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 126     return scalar_type;
 127
 128   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 129
 130   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 131   if (assign)
 132     {
 133       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
 134       if (gimple_assign_cast_p (assign)
 135           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
 140           || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
 141           || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
 142         {
 143           tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 144
 145           rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 146           if (rhs < lhs)
 147             scalar_type = rhs_type;
 148         }
 149     }
 150   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 151     {
 152       unsigned int i = 0;
 153       if (gimple_call_internal_p (call))
 154         {
 155           internal_fn ifn = gimple_call_internal_fn (call);
 156           if (internal_load_fn_p (ifn))
 157             /* For loads the LHS type does the trick.  */
 158             i = ~0U;
 159           else if (internal_store_fn_p (ifn))
 160             {
 161               /* For stores use the tyep of the stored value.  */
 162               i = internal_fn_stored_value_index (ifn);
 163               scalar_type = TREE_TYPE (gimple_call_arg (call, i));
 164               i = ~0U;
 165             }
 166           else if (internal_fn_mask_index (ifn) == 0)
 167             i = 1;
 168         }
 169       if (i < gimple_call_num_args (call))
 170         {
 171           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 172           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 173             {
 174               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 175               if (rhs < lhs)
 176                 scalar_type = rhs_type;
 177             }
 178         }
 179     }
 180
 181   return scalar_type;
 182 }
 183
 184
 185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 186    tested at run-time.  Return TRUE if DDR was successfully inserted.
 187    Return false if versioning is not supported.  */
 188
 189 static opt_result
 190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 191 {
 192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 193
 194   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 195     return opt_result::failure_at (vect_location,
 196                                    "will not create alias checks, as"
 197                                    " --param vect-max-version-for-alias-checks"
 198                                    " == 0\n");
 199
 200   opt_result res
 201     = runtime_alias_check_p (ddr, loop,
 202                              optimize_loop_nest_for_speed_p (loop));
 203   if (!res)
 204     return res;
 205
 206   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 207   return opt_result::success ();
 208 }
 209
 210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 211
 212 static void
 213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 214 {
 215   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 216   for (unsigned int i = 0; i < checks.length(); ++i)
 217     if (checks[i] == value)
 218       return;
 219
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location,
 222                      "need run-time check that %T is nonzero\n",
 223                      value);
 224   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 225 }
 226
 227 /* Return true if we know that the order of vectorized DR_INFO_A and
 228    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 229    DR_INFO_B.  At least one of the accesses is a write.  */
 230
 231 static bool
 232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 233 {
 234   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 235   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 236
 237   /* Single statements are always kept in their original order.  */
 238   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 239       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 240     return true;
 241
 242   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 243      emitted at the position of the first scalar load.
 244      Stores in a group are emitted at the position of the last scalar store.
 245      Compute that position and check whether the resulting order matches
 246      the current one.  */
 247   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 248   if (il_a)
 249     {
 250       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 251         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 252              s = DR_GROUP_NEXT_ELEMENT (s))
 253           il_a = get_later_stmt (il_a, s);
 254       else /* DR_IS_READ */
 255         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 256              s = DR_GROUP_NEXT_ELEMENT (s))
 257           if (get_later_stmt (il_a, s) == il_a)
 258             il_a = s;
 259     }
 260   else
 261     il_a = stmtinfo_a;
 262   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 263   if (il_b)
 264     {
 265       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 266         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 267              s = DR_GROUP_NEXT_ELEMENT (s))
 268           il_b = get_later_stmt (il_b, s);
 269       else /* DR_IS_READ */
 270         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 271              s = DR_GROUP_NEXT_ELEMENT (s))
 272           if (get_later_stmt (il_b, s) == il_b)
 273             il_b = s;
 274     }
 275   else
 276     il_b = stmtinfo_b;
 277   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 278   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 279 }
 280
 281 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 282    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 283    distances.  These distances are conservatively correct but they don't
 284    reflect a guaranteed dependence.
 285
 286    Return true if this function does all the work necessary to avoid
 287    an alias or false if the caller should use the dependence distances
 288    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 289    the depth of the loop described by LOOP_VINFO and the other arguments
 290    are as for vect_analyze_data_ref_dependence.  */
 291
 292 static bool
 293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 294                                        loop_vec_info loop_vinfo,
 295                                        int loop_depth, unsigned int *max_vf)
 296 {
 297   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 298   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
 299     {
 300       int dist = dist_v[loop_depth];
 301       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 302         {
 303           /* If the user asserted safelen >= DIST consecutive iterations
 304              can be executed concurrently, assume independence.
 305
 306              ??? An alternative would be to add the alias check even
 307              in this case, and vectorize the fallback loop with the
 308              maximum VF set to safelen.  However, if the user has
 309              explicitly given a length, it's less likely that that
 310              would be a win.  */
 311           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 312             {
 313               if ((unsigned int) loop->safelen < *max_vf)
 314                 *max_vf = loop->safelen;
 315               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 316               continue;
 317             }
 318
 319           /* For dependence distances of 2 or more, we have the option
 320              of limiting VF or checking for an alias at runtime.
 321              Prefer to check at runtime if we can, to avoid limiting
 322              the VF unnecessarily when the bases are in fact independent.
 323
 324              Note that the alias checks will be removed if the VF ends up
 325              being small enough.  */
 326           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 327           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 328           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 329                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 330                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 331         }
 332     }
 333   return true;
 334 }
 335
 336
 337 /* Function vect_analyze_data_ref_dependence.
 338
 339    FIXME: I needed to change the sense of the returned flag.
 340
 341    Return FALSE if there (might) exist a dependence between a memory-reference
 342    DRA and a memory-reference DRB.  When versioning for alias may check a
 343    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 344    the data dependence.  */
 345
 346 static opt_result
 347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 348                                   loop_vec_info loop_vinfo,
 349                                   unsigned int *max_vf)
 350 {
 351   unsigned int i;
 352   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 353   struct data_reference *dra = DDR_A (ddr);
 354   struct data_reference *drb = DDR_B (ddr);
 355   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 356   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 357   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 358   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 359   lambda_vector dist_v;
 360   unsigned int loop_depth;
 361
 362   /* If user asserted safelen consecutive iterations can be
 363      executed concurrently, assume independence.  */
 364   auto apply_safelen = [&]()
 365     {
 366       if (loop->safelen >= 2)
 367         {
 368           if ((unsigned int) loop->safelen < *max_vf)
 369             *max_vf = loop->safelen;
 370           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 371           return true;
 372         }
 373       return false;
 374     };
 375
 376   /* In loop analysis all data references should be vectorizable.  */
 377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 379     gcc_unreachable ();
 380
 381   /* Independent data accesses.  */
 382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 383     return opt_result::success ();
 384
 385   if (dra == drb
 386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 387     return opt_result::success ();
 388
 389   /* We do not have to consider dependences between accesses that belong
 390      to the same group, unless the stride could be smaller than the
 391      group size.  */
 392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 394           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 396     return opt_result::success ();
 397
 398   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 399      least two scalar iterations, there is always also a true dependence.
 400      As the vectorizer does not re-order loads and stores we can ignore
 401      the anti-dependence if TBAA can disambiguate both DRs similar to the
 402      case with known negative distance anti-dependences (positive
 403      distance anti-dependences would violate TBAA constraints).  */
 404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 407                                  get_alias_set (DR_REF (drb))))
 408     return opt_result::success ();
 409
 410   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 411       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 412     {
 413       if (apply_safelen ())
 414         return opt_result::success ();
 415
 416       return opt_result::failure_at
 417         (stmtinfo_a->stmt,
 418          "possible alias involving gather/scatter between %T and %T\n",
 419          DR_REF (dra), DR_REF (drb));
 420     }
 421
 422   /* Unknown data dependence.  */
 423   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 424     {
 425       if (apply_safelen ())
 426         return opt_result::success ();
 427
 428       if (dump_enabled_p ())
 429         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 430                          "versioning for alias required: "
 431                          "can't determine dependence between %T and %T\n",
 432                          DR_REF (dra), DR_REF (drb));
 433
 434       /* Add to list of ddrs that need to be tested at run-time.  */
 435       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 436     }
 437
 438   /* Known data dependence.  */
 439   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 440     {
 441       if (apply_safelen ())
 442         return opt_result::success ();
 443
 444       if (dump_enabled_p ())
 445         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 446                          "versioning for alias required: "
 447                          "bad dist vector for %T and %T\n",
 448                          DR_REF (dra), DR_REF (drb));
 449       /* Add to list of ddrs that need to be tested at run-time.  */
 450       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 451     }
 452
 453   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 454
 455   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 456       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 457                                                 loop_depth, max_vf))
 458     return opt_result::success ();
 459
 460   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 461     {
 462       int dist = dist_v[loop_depth];
 463
 464       if (dump_enabled_p ())
 465         dump_printf_loc (MSG_NOTE, vect_location,
 466                          "dependence distance  = %d.\n", dist);
 467
 468       if (dist == 0)
 469         {
 470           if (dump_enabled_p ())
 471             dump_printf_loc (MSG_NOTE, vect_location,
 472                              "dependence distance == 0 between %T and %T\n",
 473                              DR_REF (dra), DR_REF (drb));
 474
 475           /* When we perform grouped accesses and perform implicit CSE
 476              by detecting equal accesses and doing disambiguation with
 477              runtime alias tests like for
 478                 .. = a[i];
 479                 .. = a[i+1];
 480                 a[i] = ..;
 481                 a[i+1] = ..;
 482                 *p = ..;
 483                 .. = a[i];
 484                 .. = a[i+1];
 485              where we will end up loading { a[i], a[i+1] } once, make
 486              sure that inserting group loads before the first load and
 487              stores after the last store will do the right thing.
 488              Similar for groups like
 489                 a[i] = ...;
 490                 ... = a[i];
 491                 a[i+1] = ...;
 492              where loads from the group interleave with the store.  */
 493           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 494             return opt_result::failure_at (stmtinfo_a->stmt,
 495                                            "READ_WRITE dependence"
 496                                            " in interleaving.\n");
 497
 498           if (loop->safelen < 2)
 499             {
 500               tree indicator = dr_zero_step_indicator (dra);
 501               if (!indicator || integer_zerop (indicator))
 502                 return opt_result::failure_at (stmtinfo_a->stmt,
 503                                                "access also has a zero step\n");
 504               else if (TREE_CODE (indicator) != INTEGER_CST)
 505                 vect_check_nonzero_value (loop_vinfo, indicator);
 506             }
 507           continue;
 508         }
 509
 510       if (dist > 0 && DDR_REVERSED_P (ddr))
 511         {
 512           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 513              reversed (to make distance vector positive), and the actual
 514              distance is negative.  */
 515           if (dump_enabled_p ())
 516             dump_printf_loc (MSG_NOTE, vect_location,
 517                              "dependence distance negative.\n");
 518           /* When doing outer loop vectorization, we need to check if there is
 519              a backward dependence at the inner loop level if the dependence
 520              at the outer loop is reversed.  See PR81740.  */
 521           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 522               || nested_in_vect_loop_p (loop, stmtinfo_b))
 523             {
 524               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 525                                                          DDR_LOOP_NEST (ddr));
 526               if (dist_v[inner_depth] < 0)
 527                 return opt_result::failure_at (stmtinfo_a->stmt,
 528                                                "not vectorized, dependence "
 529                                                "between data-refs %T and %T\n",
 530                                                DR_REF (dra), DR_REF (drb));
 531             }
 532           /* Record a negative dependence distance to later limit the
 533              amount of stmt copying / unrolling we can perform.
 534              Only need to handle read-after-write dependence.  */
 535           if (DR_IS_READ (drb)
 536               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 537                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 538             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 539           continue;
 540         }
 541
 542       unsigned int abs_dist = abs (dist);
 543       if (abs_dist >= 2 && abs_dist < *max_vf)
 544         {
 545           /* The dependence distance requires reduction of the maximal
 546              vectorization factor.  */
 547           *max_vf = abs_dist;
 548           if (dump_enabled_p ())
 549             dump_printf_loc (MSG_NOTE, vect_location,
 550                              "adjusting maximal vectorization factor to %i\n",
 551                              *max_vf);
 552         }
 553
 554       if (abs_dist >= *max_vf)
 555         {
 556           /* Dependence distance does not create dependence, as far as
 557              vectorization is concerned, in this case.  */
 558           if (dump_enabled_p ())
 559             dump_printf_loc (MSG_NOTE, vect_location,
 560                              "dependence distance >= VF.\n");
 561           continue;
 562         }
 563
 564       return opt_result::failure_at (stmtinfo_a->stmt,
 565                                      "not vectorized, possible dependence "
 566                                      "between data-refs %T and %T\n",
 567                                      DR_REF (dra), DR_REF (drb));
 568     }
 569
 570   return opt_result::success ();
 571 }
 572
 573 /* Function vect_analyze_data_ref_dependences.
 574
 575    Examine all the data references in the loop, and make sure there do not
 576    exist any data dependences between them.  Set *MAX_VF according to
 577    the maximum vectorization factor the data dependences allow.  */
 578
 579 opt_result
 580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 581                                    unsigned int *max_vf)
 582 {
 583   unsigned int i;
 584   struct data_dependence_relation *ddr;
 585
 586   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 587
 588   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 589     {
 590       LOOP_VINFO_DDRS (loop_vinfo)
 591         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 592                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 593       /* We do not need read-read dependences.  */
 594       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 595                                           &LOOP_VINFO_DDRS (loop_vinfo),
 596                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 597                                           false);
 598       gcc_assert (res);
 599     }
 600
 601   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 602
 603   /* For epilogues we either have no aliases or alias versioning
 604      was applied to original loop.  Therefore we may just get max_vf
 605      using VF of original loop.  */
 606   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 607     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 608   else
 609     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 610       {
 611         opt_result res
 612           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 613         if (!res)
 614           return res;
 615       }
 616
 617   return opt_result::success ();
 618 }
 619
 620
 621 /* Function vect_slp_analyze_data_ref_dependence.
 622
 623    Return TRUE if there (might) exist a dependence between a memory-reference
 624    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 625    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 626    according to the data dependence.  */
 627
 628 static bool
 629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 630                                       struct data_dependence_relation *ddr)
 631 {
 632   struct data_reference *dra = DDR_A (ddr);
 633   struct data_reference *drb = DDR_B (ddr);
 634   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 635   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 636
 637   /* We need to check dependences of statements marked as unvectorizable
 638      as well, they still can prohibit vectorization.  */
 639
 640   /* Independent data accesses.  */
 641   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 642     return false;
 643
 644   if (dra == drb)
 645     return false;
 646
 647   /* Read-read is OK.  */
 648   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 649     return false;
 650
 651   /* If dra and drb are part of the same interleaving chain consider
 652      them independent.  */
 653   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 654       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 655           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 656     return false;
 657
 658   /* Unknown data dependence.  */
 659   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 660     {
 661       if  (dump_enabled_p ())
 662         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 663                          "can't determine dependence between %T and %T\n",
 664                          DR_REF (dra), DR_REF (drb));
 665     }
 666   else if (dump_enabled_p ())
 667     dump_printf_loc (MSG_NOTE, vect_location,
 668                      "determined dependence between %T and %T\n",
 669                      DR_REF (dra), DR_REF (drb));
 670
 671   return true;
 672 }
 673
 674
 675 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 676    contain the vector of scalar stores of this instance if we are
 677    disambiguating the loads.  */
 678
 679 static bool
 680 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
 681                                    vec<stmt_vec_info> stores,
 682                                    stmt_vec_info last_store_info)
 683 {
 684   /* This walks over all stmts involved in the SLP load/store done
 685      in NODE verifying we can sink them up to the last stmt in the
 686      group.  */
 687   if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
 688     {
 689       stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 690       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 691         {
 692           stmt_vec_info access_info
 693             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 694           if (access_info == last_access_info)
 695             continue;
 696           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 697           ao_ref ref;
 698           bool ref_initialized_p = false;
 699           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 700                gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 701             {
 702               gimple *stmt = gsi_stmt (gsi);
 703               if (! gimple_vuse (stmt))
 704                 continue;
 705
 706               /* If we couldn't record a (single) data reference for this
 707                  stmt we have to resort to the alias oracle.  */
 708               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 709               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 710               if (!dr_b)
 711                 {
 712                   /* We are moving a store - this means
 713                      we cannot use TBAA for disambiguation.  */
 714                   if (!ref_initialized_p)
 715                     ao_ref_init (&ref, DR_REF (dr_a));
 716                   if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 717                       || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 718                     return false;
 719                   continue;
 720                 }
 721
 722               bool dependent = false;
 723               /* If we run into a store of this same instance (we've just
 724                  marked those) then delay dependence checking until we run
 725                  into the last store because this is where it will have
 726                  been sunk to (and we verify if we can do that as well).  */
 727               if (gimple_visited_p (stmt))
 728                 {
 729                   if (stmt_info != last_store_info)
 730                     continue;
 731
 732                   for (stmt_vec_info &store_info : stores)
 733                     {
 734                       data_reference *store_dr
 735                         = STMT_VINFO_DATA_REF (store_info);
 736                       ddr_p ddr = initialize_data_dependence_relation
 737                                     (dr_a, store_dr, vNULL);
 738                       dependent
 739                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 740                       free_dependence_relation (ddr);
 741                       if (dependent)
 742                         break;
 743                     }
 744                 }
 745               else
 746                 {
 747                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 748                                                                    dr_b, vNULL);
 749                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 750                   free_dependence_relation (ddr);
 751                 }
 752               if (dependent)
 753                 return false;
 754             }
 755         }
 756     }
 757   else /* DR_IS_READ */
 758     {
 759       stmt_vec_info first_access_info
 760         = vect_find_first_scalar_stmt_in_slp (node);
 761       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 762         {
 763           stmt_vec_info access_info
 764             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 765           if (access_info == first_access_info)
 766             continue;
 767           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 768           ao_ref ref;
 769           bool ref_initialized_p = false;
 770           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 771                gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
 772             {
 773               gimple *stmt = gsi_stmt (gsi);
 774               if (! gimple_vdef (stmt))
 775                 continue;
 776
 777               /* If we couldn't record a (single) data reference for this
 778                  stmt we have to resort to the alias oracle.  */
 779               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 780               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 781
 782               /* We are hoisting a load - this means we can use
 783                  TBAA for disambiguation.  */
 784               if (!ref_initialized_p)
 785                 ao_ref_init (&ref, DR_REF (dr_a));
 786               if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
 787                 {
 788                   if (!dr_b)
 789                     return false;
 790                   /* Resort to dependence checking below.  */
 791                 }
 792               else
 793                 /* No dependence.  */
 794                 continue;
 795
 796               bool dependent = false;
 797               /* If we run into a store of this same instance (we've just
 798                  marked those) then delay dependence checking until we run
 799                  into the last store because this is where it will have
 800                  been sunk to (and we verify if we can do that as well).  */
 801               if (gimple_visited_p (stmt))
 802                 {
 803                   if (stmt_info != last_store_info)
 804                     continue;
 805
 806                   for (stmt_vec_info &store_info : stores)
 807                     {
 808                       data_reference *store_dr
 809                         = STMT_VINFO_DATA_REF (store_info);
 810                       ddr_p ddr = initialize_data_dependence_relation
 811                                     (dr_a, store_dr, vNULL);
 812                       dependent
 813                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 814                       free_dependence_relation (ddr);
 815                       if (dependent)
 816                         break;
 817                     }
 818                 }
 819               else
 820                 {
 821                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 822                                                                    dr_b, vNULL);
 823                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 824                   free_dependence_relation (ddr);
 825                 }
 826               if (dependent)
 827                 return false;
 828             }
 829         }
 830     }
 831   return true;
 832 }
 833
 834
 835 /* Function vect_analyze_data_ref_dependences.
 836
 837    Examine all the data references in the basic-block, and make sure there
 838    do not exist any data dependences between them.  Set *MAX_VF according to
 839    the maximum vectorization factor the data dependences allow.  */
 840
 841 bool
 842 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
 843 {
 844   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 845
 846   /* The stores of this instance are at the root of the SLP tree.  */
 847   slp_tree store = NULL;
 848   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
 849     store = SLP_INSTANCE_TREE (instance);
 850
 851   /* Verify we can sink stores to the vectorized stmt insert location.  */
 852   stmt_vec_info last_store_info = NULL;
 853   if (store)
 854     {
 855       if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
 856         return false;
 857
 858       /* Mark stores in this instance and remember the last one.  */
 859       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 860       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 861         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 862     }
 863
 864   bool res = true;
 865
 866   /* Verify we can sink loads to the vectorized stmt insert location,
 867      special-casing stores of this instance.  */
 868   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
 869     if (! vect_slp_analyze_node_dependences (vinfo, load,
 870                                              store
 871                                              ? SLP_TREE_SCALAR_STMTS (store)
 872                                              : vNULL, last_store_info))
 873       {
 874         res = false;
 875         break;
 876       }
 877
 878   /* Unset the visited flag.  */
 879   if (store)
 880     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 881       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 882
 883   return res;
 884 }
 885
 886 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
 887    applied.  */
 888
 889 int
 890 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
 891 {
 892   HOST_WIDE_INT diff = 0;
 893   /* Alignment is only analyzed for the first element of a DR group,
 894      use that but adjust misalignment by the offset of the access.  */
 895   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
 896     {
 897       dr_vec_info *first_dr
 898         = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
 899       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
 900          INTEGER_CSTs and the first element in the group has the lowest
 901          address.  */
 902       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
 903               - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
 904       gcc_assert (diff >= 0);
 905       dr_info = first_dr;
 906     }
 907
 908   int misalign = dr_info->misalignment;
 909   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
 910   if (misalign == DR_MISALIGNMENT_UNKNOWN)
 911     return misalign;
 912
 913   /* If the access is only aligned for a vector type with smaller alignment
 914      requirement the access has unknown misalignment.  */
 915   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
 916                 targetm.vectorize.preferred_vector_alignment (vectype)))
 917     return DR_MISALIGNMENT_UNKNOWN;
 918
 919   /* Apply the offset from the DR group start and the externally supplied
 920      offset which can for example result from a negative stride access.  */
 921   poly_int64 misalignment = misalign + diff + offset;
 922
 923   /* vect_compute_data_ref_alignment will have ensured that target_alignment
 924      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
 925   unsigned HOST_WIDE_INT target_alignment_c
 926     = dr_info->target_alignment.to_constant ();
 927   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
 928     return DR_MISALIGNMENT_UNKNOWN;
 929   return misalign;
 930 }
 931
 932 /* Record the base alignment guarantee given by DRB, which occurs
 933    in STMT_INFO.  */
 934
 935 static void
 936 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
 937                             innermost_loop_behavior *drb)
 938 {
 939   bool existed;
 940   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
 941     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 942   if (!existed || entry.second->base_alignment < drb->base_alignment)
 943     {
 944       entry = std::make_pair (stmt_info, drb);
 945       if (dump_enabled_p ())
 946         dump_printf_loc (MSG_NOTE, vect_location,
 947                          "recording new base alignment for %T\n"
 948                          "  alignment:    %d\n"
 949                          "  misalignment: %d\n"
 950                          "  based on:     %G",
 951                          drb->base_address,
 952                          drb->base_alignment,
 953                          drb->base_misalignment,
 954                          stmt_info->stmt);
 955     }
 956 }
 957
 958 /* If the region we're going to vectorize is reached, all unconditional
 959    data references occur at least once.  We can therefore pool the base
 960    alignment guarantees from each unconditional reference.  Do this by
 961    going through all the data references in VINFO and checking whether
 962    the containing statement makes the reference unconditionally.  If so,
 963    record the alignment of the base address in VINFO so that it can be
 964    used for all other references with the same base.  */
 965
 966 void
 967 vect_record_base_alignments (vec_info *vinfo)
 968 {
 969   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 970   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 971   for (data_reference *dr : vinfo->shared->datarefs)
 972     {
 973       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 974       stmt_vec_info stmt_info = dr_info->stmt;
 975       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 976           && STMT_VINFO_VECTORIZABLE (stmt_info)
 977           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 978         {
 979           vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
 980
 981           /* If DR is nested in the loop that is being vectorized, we can also
 982              record the alignment of the base wrt the outer loop.  */
 983           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 984             vect_record_base_alignment
 985               (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 986         }
 987     }
 988 }
 989
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO when vectorizing
   with VECTYPE.

   Output:
   1. initialized misalignment info for DR_INFO

   The misalignment is first set to DR_MISALIGNMENT_UNKNOWN and stays that
   way on every early return below.  The target alignment of DR_INFO is set
   to the preferred vector alignment of VECTYPE (in bytes) unless DR_INFO is
   a gather/scatter access.  A known misalignment is only recorded when the
   vector alignment is constant, the step preserves the misalignment, the
   offset is sufficiently aligned and the base is (or can be forced to be)
   sufficiently aligned.

   FOR NOW: No analysis is actually performed. Misalignment is calculated
   only for trivial cases. TODO.  */

static void
vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
                                 tree vectype)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  /* LOOP stays NULL for BB vectorization.  */
  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  /* Gather/scatter accesses keep the unknown misalignment set above and do
     not get a target alignment.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
  bool step_preserves_misalignment_p;

  /* The preferred vector alignment is given in bits; record it in bytes.  */
  poly_uint64 vector_alignment
    = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
                 BITS_PER_UNIT);
  SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);

  /* If the main loop has peeled for alignment we have no way of knowing
     whether the data accesses in the epilogues are aligned.  We can't at
     compile time answer the question whether we have entered the main loop or
     not.  Fixes PR 92351.  */
  if (loop_vinfo)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      if (orig_loop_vinfo
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
        return;
    }

  /* Only a compile-time constant vector alignment is handled below;
     otherwise the misalignment stays unknown.  */
  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      /* Here the step alignment is scaled by the vectorization factor
         before it is compared against the vector alignment.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  /* Start from what is known for this DR itself.  */
  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  The pooled entry is always
     usable for loop vectorization.  For BB vectorization (no LOOP_VINFO) it
     is only used when the block of the recording statement dominates the
     block of this statement and, if both are in the same block, when the
     recording statement's group number is not larger than DR_INFO's
     (presumably the group numbers reflect statement order - TODO
     confirm).  */
  std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
    = base_alignments->get (drb->base_address);
  if (entry
      && base_alignment < (*entry).second->base_alignment
      && (loop_vinfo
          || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
                              gimple_bb (entry->first->stmt))
              && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
                  || (entry->first->dr_aux.group <= dr_info->group)))))
    {
      base_alignment = entry->second->base_alignment;
      base_misalignment = entry->second->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown alignment for access: %T\n", ref);
      return;
    }

  /* The base is not known to be sufficiently aligned; see whether its
     alignment can be forced instead.  */
  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
          || !vect_can_force_dr_alignment_p (base,
                                             vect_align_c * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "can't force alignment of ref: %T\n", ref);
          return;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "force alignment of %T\n", ref);

      /* Only record the request on DR_INFO here; with the base assumed to
         be aligned its misalignment becomes zero.  */
      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  /* The misalignment of the access is that of the base plus the constant
     part of the address (DRB->init).  */
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  /* NOTE(review): this informational message is emitted with
     MSG_MISSED_OPTIMIZATION rather than MSG_NOTE - confirm that this is
     intended.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     const_misalignment, ref);

  return;
}
1170
1171 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1172    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1173    is made aligned via peeling.  */
1174
1175 static bool
1176 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1177                                          dr_vec_info *dr_peel_info)
1178 {
1179   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1180                   DR_TARGET_ALIGNMENT (dr_info)))
1181     {
1182       poly_offset_int diff
1183         = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1184            - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1185       if (known_eq (diff, 0)
1186           || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1187         return true;
1188     }
1189   return false;
1190 }
1191
1192 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1193    aligned via peeling.  */
1194
1195 static bool
1196 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1197                                  dr_vec_info *dr_peel_info)
1198 {
1199   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1200                         DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1201       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1202                            DR_OFFSET (dr_peel_info->dr), 0)
1203       || !operand_equal_p (DR_STEP (dr_info->dr),
1204                            DR_STEP (dr_peel_info->dr), 0))
1205     return false;
1206
1207   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 }
1209
1210 /* Compute the value for dr_info->misalign so that the access appears
1211    aligned.  This is used by peeling to compensate for dr_misalignment
1212    applying the offset for negative step.  */
1213
1214 int
1215 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1216 {
1217   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1218     return 0;
1219
1220   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1221   poly_int64 misalignment
1222     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1223        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1224
1225   unsigned HOST_WIDE_INT target_alignment_c;
1226   int misalign;
1227   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1228       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1229     return DR_MISALIGNMENT_UNKNOWN;
1230   return misalign;
1231 }
1232
1233 /* Function vect_update_misalignment_for_peel.
1234    Sets DR_INFO's misalignment
1235    - to 0 if it has the same alignment as DR_PEEL_INFO,
1236    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1237    - to -1 (unknown) otherwise.
1238
1239    DR_INFO - the data reference whose misalignment is to be adjusted.
1240    DR_PEEL_INFO - the data reference whose misalignment is being made
1241                   zero in the vector loop by the peel.
1242    NPEEL - the number of iterations in the peel loop if the misalignment
1243            of DR_PEEL_INFO is known at compile time.  */
1244
1245 static void
1246 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1247                                    dr_vec_info *dr_peel_info, int npeel)
1248 {
1249   /* If dr_info is aligned of dr_peel_info is, then mark it so.  */
1250   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1251     {
1252       SET_DR_MISALIGNMENT (dr_info,
1253                            vect_dr_misalign_for_aligned_access (dr_peel_info));
1254       return;
1255     }
1256
1257   unsigned HOST_WIDE_INT alignment;
1258   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1259       && known_alignment_for_access_p (dr_info,
1260                                        STMT_VINFO_VECTYPE (dr_info->stmt))
1261       && known_alignment_for_access_p (dr_peel_info,
1262                                        STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1263     {
1264       int misal = dr_info->misalignment;
1265       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1266       misal &= alignment - 1;
1267       set_dr_misalignment (dr_info, misal);
1268       return;
1269     }
1270
1271   if (dump_enabled_p ())
1272     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1273                      "to unknown (-1).\n");
1274   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1275 }
1276
1277 /* Return true if alignment is relevant for DR_INFO.  */
1278
1279 static bool
1280 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1281 {
1282   stmt_vec_info stmt_info = dr_info->stmt;
1283
1284   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1285     return false;
1286
1287   /* For interleaving, only the alignment of the first access matters.  */
1288   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1289       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1290     return false;
1291
1292   /* Scatter-gather and invariant accesses continue to address individual
1293      scalars, so vector-level alignment is irrelevant.  */
1294   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1295       || integer_zerop (DR_STEP (dr_info->dr)))
1296     return false;
1297
1298   /* Strided accesses perform only component accesses, alignment is
1299      irrelevant for them.  */
1300   if (STMT_VINFO_STRIDED_P (stmt_info)
1301       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1302     return false;
1303
1304   return true;
1305 }
1306
1307 /* Given an memory reference EXP return whether its alignment is less
1308    than its size.  */
1309
1310 static bool
1311 not_size_aligned (tree exp)
1312 {
1313   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1314     return true;
1315
1316   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1317           > get_object_alignment (exp));
1318 }
1319
1320 /* Function vector_alignment_reachable_p
1321
1322    Return true if vector alignment for DR_INFO is reachable by peeling
1323    a few loop iterations.  Return false otherwise.  */
1324
1325 static bool
1326 vector_alignment_reachable_p (dr_vec_info *dr_info)
1327 {
1328   stmt_vec_info stmt_info = dr_info->stmt;
1329   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1330
1331   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1332     {
1333       /* For interleaved access we peel only if number of iterations in
1334          the prolog loop ({VF - misalignment}), is a multiple of the
1335          number of the interleaved accesses.  */
1336       int elem_size, mis_in_elements;
1337
1338       /* FORNOW: handle only known alignment.  */
1339       if (!known_alignment_for_access_p (dr_info, vectype))
1340         return false;
1341
1342       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1343       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1344       elem_size = vector_element_size (vector_size, nelements);
1345       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1346
1347       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1348         return false;
1349     }
1350
1351   /* If misalignment is known at the compile time then allow peeling
1352      only if natural alignment is reachable through peeling.  */
1353   if (known_alignment_for_access_p (dr_info, vectype)
1354       && !aligned_access_p (dr_info, vectype))
1355     {
1356       HOST_WIDE_INT elmsize =
1357                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1358       if (dump_enabled_p ())
1359         {
1360           dump_printf_loc (MSG_NOTE, vect_location,
1361                            "data size = %wd. misalignment = %d.\n", elmsize,
1362                            dr_misalignment (dr_info, vectype));
1363         }
1364       if (dr_misalignment (dr_info, vectype) % elmsize)
1365         {
1366           if (dump_enabled_p ())
1367             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368                              "data size does not divide the misalignment.\n");
1369           return false;
1370         }
1371     }
1372
1373   if (!known_alignment_for_access_p (dr_info, vectype))
1374     {
1375       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1376       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1377       if (dump_enabled_p ())
1378         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379                          "Unknown misalignment, %snaturally aligned\n",
1380                          is_packed ? "not " : "");
1381       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382     }
1383
1384   return true;
1385 }
1386
1387
1388 /* Calculate the cost of the memory access represented by DR_INFO.  */
1389
1390 static void
1391 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1392                            dr_alignment_support alignment_support_scheme,
1393                            int misalignment,
1394                            unsigned int *inside_cost,
1395                            unsigned int *outside_cost,
1396                            stmt_vector_for_cost *body_cost_vec,
1397                            stmt_vector_for_cost *prologue_cost_vec)
1398 {
1399   stmt_vec_info stmt_info = dr_info->stmt;
1400   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1401   int ncopies;
1402
1403   if (PURE_SLP_STMT (stmt_info))
1404     ncopies = 1;
1405   else
1406     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1407
1408   if (DR_IS_READ (dr_info->dr))
1409     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1410                         misalignment, true, inside_cost,
1411                         outside_cost, prologue_cost_vec, body_cost_vec, false);
1412   else
1413     vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1414                          misalignment, inside_cost, body_cost_vec);
1415
1416   if (dump_enabled_p ())
1417     dump_printf_loc (MSG_NOTE, vect_location,
1418                      "vect_get_data_access_cost: inside_cost = %d, "
1419                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1420 }
1421
1422
1423 typedef struct _vect_peel_info
1424 {
1425   dr_vec_info *dr_info;
1426   int npeel;
1427   unsigned int count;
1428 } *vect_peel_info;
1429
1430 typedef struct _vect_peel_extended_info
1431 {
1432   vec_info *vinfo;
1433   struct _vect_peel_info peel_info;
1434   unsigned int inside_cost;
1435   unsigned int outside_cost;
1436 } *vect_peel_extended_info;
1437
1438
1439 /* Peeling hashtable helpers.  */
1440
1441 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1442 {
1443   static inline hashval_t hash (const _vect_peel_info *);
1444   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 };
1446
1447 inline hashval_t
1448 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1449 {
1450   return (hashval_t) peel_info->npeel;
1451 }
1452
1453 inline bool
1454 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1455 {
1456   return (a->npeel == b->npeel);
1457 }
1458
1459
1460 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1461
1462 static void
1463 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1464                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1465                           int npeel, bool supportable_if_not_aligned)
1466 {
1467   struct _vect_peel_info elem, *slot;
1468   _vect_peel_info **new_slot;
1469
1470   elem.npeel = npeel;
1471   slot = peeling_htab->find (&elem);
1472   if (slot)
1473     slot->count++;
1474   else
1475     {
1476       slot = XNEW (struct _vect_peel_info);
1477       slot->npeel = npeel;
1478       slot->dr_info = dr_info;
1479       slot->count = 1;
1480       new_slot = peeling_htab->find_slot (slot, INSERT);
1481       *new_slot = slot;
1482     }
1483
1484   /* If this DR is not supported with unknown misalignment then bias
1485      this slot when the cost model is disabled.  */
1486   if (!supportable_if_not_aligned
1487       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1488     slot->count += VECT_MAX_COST;
1489 }
1490
1491
1492 /* Traverse peeling hash table to find peeling option that aligns maximum
1493    number of data accesses.  */
1494
1495 int
1496 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1497                                      _vect_peel_extended_info *max)
1498 {
1499   vect_peel_info elem = *slot;
1500
1501   if (elem->count > max->peel_info.count
1502       || (elem->count == max->peel_info.count
1503           && max->peel_info.npeel > elem->npeel))
1504     {
1505       max->peel_info.npeel = elem->npeel;
1506       max->peel_info.count = elem->count;
1507       max->peel_info.dr_info = elem->dr_info;
1508     }
1509
1510   return 1;
1511 }
1512
1513 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1514    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1515    npeel is computed at runtime but DR0_INFO's misalignment will be zero
1516    after peeling.  */
1517
1518 static void
1519 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1520                                 dr_vec_info *dr0_info,
1521                                 unsigned int *inside_cost,
1522                                 unsigned int *outside_cost,
1523                                 stmt_vector_for_cost *body_cost_vec,
1524                                 stmt_vector_for_cost *prologue_cost_vec,
1525                                 unsigned int npeel)
1526 {
1527   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1528
1529   bool dr0_alignment_known_p
1530     = (dr0_info
1531        && known_alignment_for_access_p (dr0_info,
1532                                         STMT_VINFO_VECTYPE (dr0_info->stmt)));
1533
1534   for (data_reference *dr : datarefs)
1535     {
1536       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1537       if (!vect_relevant_for_alignment_p (dr_info))
1538         continue;
1539
1540       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1541       dr_alignment_support alignment_support_scheme;
1542       int misalignment;
1543       unsigned HOST_WIDE_INT alignment;
1544
1545       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1546                                             size_zero_node) < 0;
1547       poly_int64 off = 0;
1548       if (negative)
1549         off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1550                * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1551
1552       if (npeel == 0)
1553         misalignment = dr_misalignment (dr_info, vectype, off);
1554       else if (dr_info == dr0_info
1555                || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1556         misalignment = 0;
1557       else if (!dr0_alignment_known_p
1558                || !known_alignment_for_access_p (dr_info, vectype)
1559                || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1560         misalignment = DR_MISALIGNMENT_UNKNOWN;
1561       else
1562         {
1563           misalignment = dr_misalignment (dr_info, vectype, off);
1564           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1565           misalignment &= alignment - 1;
1566         }
1567       alignment_support_scheme
1568         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1569                                          misalignment);
1570
1571       vect_get_data_access_cost (loop_vinfo, dr_info,
1572                                  alignment_support_scheme, misalignment,
1573                                  inside_cost, outside_cost,
1574                                  body_cost_vec, prologue_cost_vec);
1575     }
1576 }
1577
1578 /* Traverse peeling hash table and calculate cost for each peeling option.
1579    Find the one with the lowest cost.  */
1580
1581 int
1582 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1583                                    _vect_peel_extended_info *min)
1584 {
1585   vect_peel_info elem = *slot;
1586   int dummy;
1587   unsigned int inside_cost = 0, outside_cost = 0;
1588   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1589   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1590                        epilogue_cost_vec;
1591
1592   prologue_cost_vec.create (2);
1593   body_cost_vec.create (2);
1594   epilogue_cost_vec.create (2);
1595
1596   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1597                                   &outside_cost, &body_cost_vec,
1598                                   &prologue_cost_vec, elem->npeel);
1599
1600   body_cost_vec.release ();
1601
1602   outside_cost += vect_get_known_peeling_cost
1603     (loop_vinfo, elem->npeel, &dummy,
1604      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1605      &prologue_cost_vec, &epilogue_cost_vec);
1606
1607   /* Prologue and epilogue costs are added to the target model later.
1608      These costs depend only on the scalar iteration cost, the
1609      number of peeling iterations finally chosen, and the number of
1610      misaligned statements.  So discard the information found here.  */
1611   prologue_cost_vec.release ();
1612   epilogue_cost_vec.release ();
1613
1614   if (inside_cost < min->inside_cost
1615       || (inside_cost == min->inside_cost
1616           && outside_cost < min->outside_cost))
1617     {
1618       min->inside_cost = inside_cost;
1619       min->outside_cost = outside_cost;
1620       min->peel_info.dr_info = elem->dr_info;
1621       min->peel_info.npeel = elem->npeel;
1622       min->peel_info.count = elem->count;
1623     }
1624
1625   return 1;
1626 }
1627
1628
1629 /* Choose best peeling option by traversing peeling hash table and either
1630    choosing an option with the lowest cost (if cost model is enabled) or the
1631    option that aligns as many accesses as possible.  */
1632
1633 static struct _vect_peel_extended_info
1634 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1635                                        loop_vec_info loop_vinfo)
1636 {
1637    struct _vect_peel_extended_info res;
1638
1639    res.peel_info.dr_info = NULL;
1640    res.vinfo = loop_vinfo;
1641
1642    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1643      {
1644        res.inside_cost = INT_MAX;
1645        res.outside_cost = INT_MAX;
1646        peeling_htab->traverse <_vect_peel_extended_info *,
1647                                vect_peeling_hash_get_lowest_cost> (&res);
1648      }
1649    else
1650      {
1651        res.peel_info.count = 0;
1652        peeling_htab->traverse <_vect_peel_extended_info *,
1653                                vect_peeling_hash_get_most_frequent> (&res);
1654        res.inside_cost = 0;
1655        res.outside_cost = 0;
1656      }
1657
1658    return res;
1659 }
1660
1661 /* Return true if the new peeling NPEEL is supported.  */
1662
1663 static bool
1664 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1665                           unsigned npeel)
1666 {
1667   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1668   enum dr_alignment_support supportable_dr_alignment;
1669
1670   bool dr0_alignment_known_p
1671     = known_alignment_for_access_p (dr0_info,
1672                                     STMT_VINFO_VECTYPE (dr0_info->stmt));
1673
1674   /* Ensure that all data refs can be vectorized after the peel.  */
1675   for (data_reference *dr : datarefs)
1676     {
1677       if (dr == dr0_info->dr)
1678         continue;
1679
1680       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1681       if (!vect_relevant_for_alignment_p (dr_info)
1682           || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1683         continue;
1684
1685       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1686       int misalignment;
1687       unsigned HOST_WIDE_INT alignment;
1688       if (!dr0_alignment_known_p
1689           || !known_alignment_for_access_p (dr_info, vectype)
1690           || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1691         misalignment = DR_MISALIGNMENT_UNKNOWN;
1692       else
1693         {
1694           misalignment = dr_misalignment (dr_info, vectype);
1695           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1696           misalignment &= alignment - 1;
1697         }
1698       supportable_dr_alignment
1699         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1700                                          misalignment);
1701       if (supportable_dr_alignment == dr_unaligned_unsupported)
1702         return false;
1703     }
1704
1705   return true;
1706 }
1707
1708 /* Compare two data-references DRA and DRB to group them into chunks
1709    with related alignment.  */
1710
1711 static int
1712 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1713 {
1714   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1715   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1716   int cmp;
1717
1718   /* Stabilize sort.  */
1719   if (dra == drb)
1720     return 0;
1721
1722   /* Ordering of DRs according to base.  */
1723   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1724                                DR_BASE_ADDRESS (drb));
1725   if (cmp != 0)
1726     return cmp;
1727
1728   /* And according to DR_OFFSET.  */
1729   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1730   if (cmp != 0)
1731     return cmp;
1732
1733   /* And after step.  */
1734   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1735   if (cmp != 0)
1736     return cmp;
1737
1738   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
1739   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1740   if (cmp == 0)
1741     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1742   return cmp;
1743 }
1744
1745 /* Function vect_enhance_data_refs_alignment
1746
1747    This pass will use loop versioning and loop peeling in order to enhance
1748    the alignment of data references in the loop.
1749
1750    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1751    original loop is to be vectorized.  Any other loops that are created by
1752    the transformations performed in this pass - are not supposed to be
1753    vectorized.  This restriction will be relaxed.
1754
1755    This pass will require a cost model to guide it whether to apply peeling
1756    or versioning or a combination of the two.  For example, the scheme that
1757    Intel uses when given a loop with several memory accesses, is as follows:
1758    choose one memory access ('p') whose alignment you want to force by doing
1759    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1760    other accesses are not necessarily aligned, or (2) use loop versioning to
1761    generate one loop in which all accesses are aligned, and another loop in
1762    which only 'p' is necessarily aligned.
1763
1764    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1765    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1766    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1767
1768    Devising a cost model is the most critical aspect of this work.  It will
1769    guide us on which access to peel for, whether to use loop versioning, how
1770    many versions to create, etc.  The cost model will probably consist of
1771    generic considerations as well as target specific considerations (on
1772    powerpc for example, misaligned stores are more painful than misaligned
1773    loads).
1774
1775    Here are the general steps involved in alignment enhancements:
1776
1777      -- original loop, before alignment analysis:
1778         for (i=0; i<N; i++){
1779           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1780           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1781         }
1782
1783      -- After vect_compute_data_refs_alignment:
1784         for (i=0; i<N; i++){
1785           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1786           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1787         }
1788
1789      -- Possibility 1: we do loop versioning:
1790      if (p is aligned) {
1791         for (i=0; i<N; i++){    # loop 1A
1792           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1793           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1794         }
1795      }
1796      else {
1797         for (i=0; i<N; i++){    # loop 1B
1798           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1799           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1800         }
1801      }
1802
1803      -- Possibility 2: we do loop peeling:
1804      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1805         x = q[i];
1806         p[i] = y;
1807      }
1808      for (i = 3; i < N; i++){   # loop 2A
1809         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1810         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1811      }
1812
1813      -- Possibility 3: combination of loop peeling and versioning:
1814      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1815         x = q[i];
1816         p[i] = y;
1817      }
1818      if (p is aligned) {
1819         for (i = 3; i<N; i++){  # loop 3A
1820           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1821           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1822         }
1823      }
1824      else {
1825         for (i = 3; i<N; i++){  # loop 3B
1826           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1827           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1828         }
1829      }
1830
1831      These loops are later passed to loop_transform to be vectorized.  The
1832      vectorizer will use the alignment information to guide the transformation
1833      (whether to generate regular loads/stores, or with special handling for
1834      misalignment).  */
1835
1836 opt_result
1837 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1838 {
1839   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1840   dr_vec_info *first_store = NULL;
1841   dr_vec_info *dr0_info = NULL;
1842   struct data_reference *dr;
1843   unsigned int i;
1844   bool do_peeling = false;
1845   bool do_versioning = false;
1846   unsigned int npeel = 0;
1847   bool one_misalignment_known = false;
1848   bool one_misalignment_unknown = false;
1849   bool one_dr_unsupportable = false;
1850   dr_vec_info *unsupportable_dr_info = NULL;
1851   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1852   hash_table<peel_info_hasher> peeling_htab (1);
1853
1854   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1855
1856   /* Reset data so we can safely be called multiple times.  */
1857   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1858   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1859
1860   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1861     return opt_result::success ();
1862
1863   /* Sort the vector of datarefs so DRs that have the same or dependent
1864      alignment are next to each other.  */
1865   auto_vec<data_reference_p> datarefs
1866     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1867   datarefs.qsort (dr_align_group_sort_cmp);
1868
1869   /* Compute the number of DRs that become aligned when we peel
1870      a dataref so it becomes aligned.  */
1871   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1872   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1873   unsigned i0;
1874   for (i0 = 0; i0 < datarefs.length (); ++i0)
1875     if (DR_BASE_ADDRESS (datarefs[i0]))
1876       break;
1877   for (i = i0 + 1; i <= datarefs.length (); ++i)
1878     {
1879       if (i == datarefs.length ()
1880           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1881                                DR_BASE_ADDRESS (datarefs[i]), 0)
1882           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1883                                DR_OFFSET (datarefs[i]), 0)
1884           || !operand_equal_p (DR_STEP (datarefs[i0]),
1885                                DR_STEP (datarefs[i]), 0))
1886         {
1887           /* The subgroup [i0, i-1] now only differs in DR_INIT and
1888              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1889              will get known misalignment if we align one of the refs
1890              with the largest DR_TARGET_ALIGNMENT.  */
1891           for (unsigned j = i0; j < i; ++j)
1892             {
1893               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1894               for (unsigned k = i0; k < i; ++k)
1895                 {
1896                   if (k == j)
1897                     continue;
1898                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1899                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1900                                                                dr_infoj))
1901                     n_same_align_refs[j]++;
1902                 }
1903             }
1904           i0 = i;
1905         }
1906     }
1907
1908   /* While cost model enhancements are expected in the future, the high level
1909      view of the code at this time is as follows:
1910
1911      A) If there is a misaligned access then see if peeling to align
1912         this access can make all data references satisfy
1913         vect_supportable_dr_alignment.  If so, update data structures
1914         as needed and return true.
1915
1916      B) If peeling wasn't possible and there is a data reference with an
1917         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1918         then see if loop versioning checks can be used to make all data
1919         references satisfy vect_supportable_dr_alignment.  If so, update
1920         data structures as needed and return true.
1921
1922      C) If neither peeling nor versioning were successful then return false if
1923         any data reference does not satisfy vect_supportable_dr_alignment.
1924
1925      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1926
1927      Note, Possibility 3 above (which is peeling and versioning together) is not
1928      being done at this time.  */
1929
1930   /* (1) Peeling to force alignment.  */
1931
1932   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1933      Considerations:
1934      + How many accesses will become aligned due to the peeling
1935      - How many accesses will become unaligned due to the peeling,
1936        and the cost of misaligned accesses.
1937      - The cost of peeling (the extra runtime checks, the increase
1938        in code size).  */
1939
1940   FOR_EACH_VEC_ELT (datarefs, i, dr)
1941     {
1942       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1943       if (!vect_relevant_for_alignment_p (dr_info))
1944         continue;
1945
1946       stmt_vec_info stmt_info = dr_info->stmt;
1947       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1948       do_peeling = vector_alignment_reachable_p (dr_info);
1949       if (do_peeling)
1950         {
1951           if (known_alignment_for_access_p (dr_info, vectype))
1952             {
1953               unsigned int npeel_tmp = 0;
1954               bool negative = tree_int_cst_compare (DR_STEP (dr),
1955                                                     size_zero_node) < 0;
1956
1957               /* If known_alignment_for_access_p then we have set
1958                  DR_MISALIGNMENT which is only done if we know it at compiler
1959                  time, so it is safe to assume target alignment is constant.
1960                */
1961               unsigned int target_align =
1962                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1963               unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1964               poly_int64 off = 0;
1965               if (negative)
1966                 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1967               unsigned int mis = dr_misalignment (dr_info, vectype, off);
1968               mis = negative ? mis : -mis;
1969               if (mis != 0)
1970                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1971
1972               /* For multiple types, it is possible that the bigger type access
1973                  will have more than one peeling option.  E.g., a loop with two
1974                  types: one of size (vector size / 4), and the other one of
1975                  size (vector size / 8).  Vectorization factor will be 8.  If both
1976                  accesses are misaligned by 3, the first one needs one scalar
1977                  iteration to be aligned, and the second one needs 5.  But the
1978                  first one will be aligned also by peeling 5 scalar
1979                  iterations, and in that case both accesses will be aligned.
1980                  Hence, except for the immediate peeling amount, we also want
1981                  to try to add full vector size, while we don't exceed
1982                  vectorization factor.
1983                  We do this automatically for cost model, since we calculate
1984                  cost for every peeling option.  */
1985               poly_uint64 nscalars = npeel_tmp;
1986               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1987                 {
1988                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989                   nscalars = (STMT_SLP_TYPE (stmt_info)
1990                               ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991                 }
1992
1993               /* Save info about DR in the hash table.  Also include peeling
1994                  amounts according to the explanation above.  Indicate
1995                  the alignment status when the ref is not aligned.
1996                  ???  Rather than using unknown alignment here we should
1997                  prune all entries from the peeling hashtable which cause
1998                  DRs to be not supported.  */
1999               bool supportable_if_not_aligned
2000                 = vect_supportable_dr_alignment
2001                     (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2002               while (known_le (npeel_tmp, nscalars))
2003                 {
2004                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2005                                             dr_info, npeel_tmp,
2006                                             supportable_if_not_aligned);
2007                   npeel_tmp += MAX (1, target_align / dr_size);
2008                 }
2009
2010               one_misalignment_known = true;
2011             }
2012           else
2013             {
2014               /* If we don't know any misalignment values, we prefer
2015                  peeling for data-ref that has the maximum number of data-refs
2016                  with the same alignment, unless the target prefers to align
2017                  stores over load.  */
2018               unsigned same_align_drs = n_same_align_refs[i];
2019               if (!dr0_info
2020                   || dr0_same_align_drs < same_align_drs)
2021                 {
2022                   dr0_same_align_drs = same_align_drs;
2023                   dr0_info = dr_info;
2024                 }
2025               /* For data-refs with the same number of related
2026                  accesses prefer the one where the misalign
2027                  computation will be invariant in the outermost loop.  */
2028               else if (dr0_same_align_drs == same_align_drs)
2029                 {
2030                   class loop *ivloop0, *ivloop;
2031                   ivloop0 = outermost_invariant_loop_for_expr
2032                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
2033                   ivloop = outermost_invariant_loop_for_expr
2034                     (loop, DR_BASE_ADDRESS (dr));
2035                   if ((ivloop && !ivloop0)
2036                       || (ivloop && ivloop0
2037                           && flow_loop_nested_p (ivloop, ivloop0)))
2038                     dr0_info = dr_info;
2039                 }
2040
2041               one_misalignment_unknown = true;
2042
2043               /* Check for data refs with unsupportable alignment that
2044                  can be peeled.  */
2045               enum dr_alignment_support supportable_dr_alignment
2046                 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2047                                                  DR_MISALIGNMENT_UNKNOWN);
2048               if (supportable_dr_alignment == dr_unaligned_unsupported)
2049                 {
2050                   one_dr_unsupportable = true;
2051                   unsupportable_dr_info = dr_info;
2052                 }
2053
2054               if (!first_store && DR_IS_WRITE (dr))
2055                 {
2056                   first_store = dr_info;
2057                   first_store_same_align_drs = same_align_drs;
2058                 }
2059             }
2060         }
2061       else
2062         {
2063           if (!aligned_access_p (dr_info, vectype))
2064             {
2065               if (dump_enabled_p ())
2066                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067                                  "vector alignment may not be reachable\n");
2068               break;
2069             }
2070         }
2071     }
2072
2073   /* Check if we can possibly peel the loop.  */
2074   if (!vect_can_advance_ivs_p (loop_vinfo)
2075       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2076       || loop->inner)
2077     do_peeling = false;
2078
2079   struct _vect_peel_extended_info peel_for_known_alignment;
2080   struct _vect_peel_extended_info peel_for_unknown_alignment;
2081   struct _vect_peel_extended_info best_peel;
2082
2083   peel_for_unknown_alignment.inside_cost = INT_MAX;
2084   peel_for_unknown_alignment.outside_cost = INT_MAX;
2085   peel_for_unknown_alignment.peel_info.count = 0;
2086
2087   if (do_peeling
2088       && one_misalignment_unknown)
2089     {
2090       /* Check if the target requires to prefer stores over loads, i.e., if
2091          misaligned stores are more expensive than misaligned loads (taking
2092          drs with same alignment into account).  */
2093       unsigned int load_inside_cost = 0;
2094       unsigned int load_outside_cost = 0;
2095       unsigned int store_inside_cost = 0;
2096       unsigned int store_outside_cost = 0;
2097       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2098
2099       stmt_vector_for_cost dummy;
2100       dummy.create (2);
2101       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2102                                       &load_inside_cost,
2103                                       &load_outside_cost,
2104                                       &dummy, &dummy, estimated_npeels);
2105       dummy.release ();
2106
2107       if (first_store)
2108         {
2109           dummy.create (2);
2110           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2111                                           &store_inside_cost,
2112                                           &store_outside_cost,
2113                                           &dummy, &dummy,
2114                                           estimated_npeels);
2115           dummy.release ();
2116         }
2117       else
2118         {
2119           store_inside_cost = INT_MAX;
2120           store_outside_cost = INT_MAX;
2121         }
2122
2123       if (load_inside_cost > store_inside_cost
2124           || (load_inside_cost == store_inside_cost
2125               && load_outside_cost > store_outside_cost))
2126         {
2127           dr0_info = first_store;
2128           dr0_same_align_drs = first_store_same_align_drs;
2129           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2130           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2131         }
2132       else
2133         {
2134           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2135           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136         }
2137
2138       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2139       prologue_cost_vec.create (2);
2140       epilogue_cost_vec.create (2);
2141
2142       int dummy2;
2143       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2144         (loop_vinfo, estimated_npeels, &dummy2,
2145          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2146          &prologue_cost_vec, &epilogue_cost_vec);
2147
2148       prologue_cost_vec.release ();
2149       epilogue_cost_vec.release ();
2150
2151       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152     }
2153
2154   peel_for_unknown_alignment.peel_info.npeel = 0;
2155   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2156
2157   best_peel = peel_for_unknown_alignment;
2158
2159   peel_for_known_alignment.inside_cost = INT_MAX;
2160   peel_for_known_alignment.outside_cost = INT_MAX;
2161   peel_for_known_alignment.peel_info.count = 0;
2162   peel_for_known_alignment.peel_info.dr_info = NULL;
2163
2164   if (do_peeling && one_misalignment_known)
2165     {
2166       /* Peeling is possible, but there is no data access that is not supported
2167          unless aligned.  So we try to choose the best possible peeling from
2168          the hash table.  */
2169       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2170         (&peeling_htab, loop_vinfo);
2171     }
2172
2173   /* Compare costs of peeling for known and unknown alignment. */
2174   if (peel_for_known_alignment.peel_info.dr_info != NULL
2175       && peel_for_unknown_alignment.inside_cost
2176       >= peel_for_known_alignment.inside_cost)
2177     {
2178       best_peel = peel_for_known_alignment;
2179
2180       /* If the best peeling for known alignment has NPEEL == 0, perform no
2181          peeling at all except if there is an unsupportable dr that we can
2182          align.  */
2183       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2184         do_peeling = false;
2185     }
2186
2187   /* If there is an unsupportable data ref, prefer this over all choices so far
2188      since we'd have to discard a chosen peeling except when it accidentally
2189      aligned the unsupportable data ref.  */
2190   if (one_dr_unsupportable)
2191     dr0_info = unsupportable_dr_info;
2192   else if (do_peeling)
2193     {
2194       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2195          TODO: Use nopeel_outside_cost or get rid of it?  */
2196       unsigned nopeel_inside_cost = 0;
2197       unsigned nopeel_outside_cost = 0;
2198
2199       stmt_vector_for_cost dummy;
2200       dummy.create (2);
2201       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2202                                       &nopeel_outside_cost, &dummy, &dummy, 0);
2203       dummy.release ();
2204
2205       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2206          costs will be recorded.  */
2207       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2208       prologue_cost_vec.create (2);
2209       epilogue_cost_vec.create (2);
2210
2211       int dummy2;
2212       nopeel_outside_cost += vect_get_known_peeling_cost
2213         (loop_vinfo, 0, &dummy2,
2214          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2215          &prologue_cost_vec, &epilogue_cost_vec);
2216
2217       prologue_cost_vec.release ();
2218       epilogue_cost_vec.release ();
2219
2220       npeel = best_peel.peel_info.npeel;
2221       dr0_info = best_peel.peel_info.dr_info;
2222
2223       /* If no peeling is not more expensive than the best peeling we
2224          have so far, don't perform any peeling.  */
2225       if (nopeel_inside_cost <= best_peel.inside_cost)
2226         do_peeling = false;
2227     }
2228
2229   if (do_peeling)
2230     {
2231       stmt_vec_info stmt_info = dr0_info->stmt;
2232       if (known_alignment_for_access_p (dr0_info,
2233                                         STMT_VINFO_VECTYPE (stmt_info)))
2234         {
2235           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2236                                                 size_zero_node) < 0;
2237           if (!npeel)
2238             {
2239               /* Since it's known at compile time, compute the number of
2240                  iterations in the peeled loop (the peeling factor) for use in
2241                  updating DR_MISALIGNMENT values.  The peeling factor is the
2242                  vectorization factor minus the misalignment as an element
2243                  count.  */
2244               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2245               poly_int64 off = 0;
2246               if (negative)
2247                 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2248                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2249               unsigned int mis
2250                 = dr_misalignment (dr0_info, vectype, off);
2251               mis = negative ? mis : -mis;
2252               /* If known_alignment_for_access_p then we have set
2253                  DR_MISALIGNMENT which is only done if we know it at compiler
2254                  time, so it is safe to assume target alignment is constant.
2255                */
2256               unsigned int target_align =
2257                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2258               npeel = ((mis & (target_align - 1))
2259                        / vect_get_scalar_dr_size (dr0_info));
2260             }
2261
2262           /* For interleaved data access every iteration accesses all the
2263              members of the group, therefore we divide the number of iterations
2264              by the group size.  */
2265           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2266             npeel /= DR_GROUP_SIZE (stmt_info);
2267
2268           if (dump_enabled_p ())
2269             dump_printf_loc (MSG_NOTE, vect_location,
2270                              "Try peeling by %d\n", npeel);
2271         }
2272
2273       /* Ensure that all datarefs can be vectorized after the peel.  */
2274       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2275         do_peeling = false;
2276
2277       /* Check if all datarefs are supportable and log.  */
2278       if (do_peeling
2279           && npeel == 0
2280           && known_alignment_for_access_p (dr0_info,
2281                                            STMT_VINFO_VECTYPE (stmt_info)))
2282         return opt_result::success ();
2283
2284       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2285       if (do_peeling)
2286         {
2287           unsigned max_allowed_peel
2288             = param_vect_max_peeling_for_alignment;
2289           if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2290             max_allowed_peel = 0;
2291           if (max_allowed_peel != (unsigned)-1)
2292             {
2293               unsigned max_peel = npeel;
2294               if (max_peel == 0)
2295                 {
2296                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2297                   unsigned HOST_WIDE_INT target_align_c;
2298                   if (target_align.is_constant (&target_align_c))
2299                     max_peel =
2300                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2301                   else
2302                     {
2303                       do_peeling = false;
2304                       if (dump_enabled_p ())
2305                         dump_printf_loc (MSG_NOTE, vect_location,
2306                           "Disable peeling, max peels set and vector"
2307                           " alignment unknown\n");
2308                     }
2309                 }
2310               if (max_peel > max_allowed_peel)
2311                 {
2312                   do_peeling = false;
2313                   if (dump_enabled_p ())
2314                     dump_printf_loc (MSG_NOTE, vect_location,
2315                         "Disable peeling, max peels reached: %d\n", max_peel);
2316                 }
2317             }
2318         }
2319
2320       /* Cost model #2 - if peeling may result in a remaining loop not
2321          iterating enough to be vectorized then do not peel.  Since this
2322          is a cost heuristic rather than a correctness decision, use the
2323          most likely runtime value for variable vectorization factors.  */
2324       if (do_peeling
2325           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2326         {
2327           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2328           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2329           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2330               < assumed_vf + max_peel)
2331             do_peeling = false;
2332         }
2333
2334       if (do_peeling)
2335         {
2336           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2337              If the misalignment of DR_i is identical to that of dr0 then set
2338              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2339              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2340              by the peeling factor times the element size of DR_i (MOD the
2341              vectorization factor times the size).  Otherwise, the
2342              misalignment of DR_i must be set to unknown.  */
2343           FOR_EACH_VEC_ELT (datarefs, i, dr)
2344             if (dr != dr0_info->dr)
2345               {
2346                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2347                 if (!vect_relevant_for_alignment_p (dr_info))
2348                   continue;
2349
2350                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351               }
2352
2353           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2354           if (npeel)
2355             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2356           else
2357             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2358           SET_DR_MISALIGNMENT (dr0_info,
2359                                vect_dr_misalign_for_aligned_access (dr0_info));
2360           if (dump_enabled_p ())
2361             {
2362               dump_printf_loc (MSG_NOTE, vect_location,
2363                                "Alignment of access forced using peeling.\n");
2364               dump_printf_loc (MSG_NOTE, vect_location,
2365                                "Peeling for alignment will be applied.\n");
2366             }
2367
2368           /* The inside-loop cost will be accounted for in vectorizable_load
2369              and vectorizable_store correctly with adjusted alignments.
2370              Drop the body_cst_vec on the floor here.  */
2371           return opt_result::success ();
2372         }
2373     }
2374
2375   /* (2) Versioning to force alignment.  */
2376
2377   /* Try versioning if:
2378      1) optimize loop for speed and the cost-model is not cheap
2379      2) there is at least one unsupported misaligned data ref with an unknown
2380         misalignment, and
2381      3) all misaligned data refs with a known misalignment are supported, and
2382      4) the number of runtime alignment checks is within reason.  */
2383
2384   do_versioning
2385     = (optimize_loop_nest_for_speed_p (loop)
2386        && !loop->inner /* FORNOW */
2387        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2388
2389   if (do_versioning)
2390     {
2391       FOR_EACH_VEC_ELT (datarefs, i, dr)
2392         {
2393           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2394           if (!vect_relevant_for_alignment_p (dr_info))
2395             continue;
2396
2397           stmt_vec_info stmt_info = dr_info->stmt;
2398           if (STMT_VINFO_STRIDED_P (stmt_info))
2399             {
2400               do_versioning = false;
2401               break;
2402             }
2403
2404           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2405           bool negative = tree_int_cst_compare (DR_STEP (dr),
2406                                                 size_zero_node) < 0;
2407           poly_int64 off = 0;
2408           if (negative)
2409             off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2410                    * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2411           int misalignment;
2412           if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2413             continue;
2414
2415           enum dr_alignment_support supportable_dr_alignment
2416             = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2417                                              misalignment);
2418           if (supportable_dr_alignment == dr_unaligned_unsupported)
2419             {
2420               if (misalignment != DR_MISALIGNMENT_UNKNOWN
2421                   || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2422                       >= (unsigned) param_vect_max_version_for_alignment_checks))
2423                 {
2424                   do_versioning = false;
2425                   break;
2426                 }
2427
2428               /* At present we don't support versioning for alignment
2429                  with variable VF, since there's no guarantee that the
2430                  VF is a power of two.  We could relax this if we added
2431                  a way of enforcing a power-of-two size.  */
2432               unsigned HOST_WIDE_INT size;
2433               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2434                 {
2435                   do_versioning = false;
2436                   break;
2437                 }
2438
2439               /* Forcing alignment in the first iteration is no good if
2440                  we don't keep it across iterations.  For now, just disable
2441                  versioning in this case.
2442                  ?? We could actually unroll the loop to achieve the required
2443                  overall step alignment, and forcing the alignment could be
2444                  done by doing some iterations of the non-vectorized loop.  */
2445               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2446                                * DR_STEP_ALIGNMENT (dr),
2447                                DR_TARGET_ALIGNMENT (dr_info)))
2448                 {
2449                   do_versioning = false;
2450                   break;
2451                 }
2452
2453               /* The rightmost bits of an aligned address must be zeros.
2454                  Construct the mask needed for this test.  For example,
2455                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2456                  mask must be 15 = 0xf. */
2457               int mask = size - 1;
2458
2459               /* FORNOW: use the same mask to test all potentially unaligned
2460                  references in the loop.  */
2461               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2462                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2463                 {
2464                   do_versioning = false;
2465                   break;
2466                 }
2467
2468               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2469               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2470             }
2471         }
2472
2473       /* Versioning requires at least one misaligned data reference.  */
2474       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2475         do_versioning = false;
2476       else if (!do_versioning)
2477         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478     }
2479
2480   if (do_versioning)
2481     {
2482       const vec<stmt_vec_info> &may_misalign_stmts
2483         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2484       stmt_vec_info stmt_info;
2485
2486       /* It can now be assumed that the data references in the statements
2487          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2488          of the loop being vectorized.  */
2489       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2490         {
2491           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2492           SET_DR_MISALIGNMENT (dr_info,
2493                                vect_dr_misalign_for_aligned_access (dr_info));
2494           if (dump_enabled_p ())
2495             dump_printf_loc (MSG_NOTE, vect_location,
2496                              "Alignment of access forced using versioning.\n");
2497         }
2498
2499       if (dump_enabled_p ())
2500         dump_printf_loc (MSG_NOTE, vect_location,
2501                          "Versioning for alignment will be applied.\n");
2502
2503       /* Peeling and versioning can't be done together at this time.  */
2504       gcc_assert (! (do_peeling && do_versioning));
2505
2506       return opt_result::success ();
2507     }
2508
2509   /* This point is reached if neither peeling nor versioning is being done.  */
2510   gcc_assert (! (do_peeling || do_versioning));
2511
2512   return opt_result::success ();
2513 }
2514
2515
2516 /* Function vect_analyze_data_refs_alignment
2517
2518    Analyze the alignment of the data-references in the loop.
2519    Return FALSE if a data reference is found that cannot be vectorized.  */
2520
2521 opt_result
2522 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2523 {
2524   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2525
2526   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2527   struct data_reference *dr;
2528   unsigned int i;
2529
2530   vect_record_base_alignments (loop_vinfo);
2531   FOR_EACH_VEC_ELT (datarefs, i, dr)
2532     {
2533       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2534       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2535         {
2536           if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2537               && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2538             continue;
2539           vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2540                                            STMT_VINFO_VECTYPE (dr_info->stmt));
2541         }
2542     }
2543
2544   return opt_result::success ();
2545 }
2546
2547
2548 /* Analyze alignment of DRs of stmts in NODE.  */
2549
2550 static bool
2551 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2552 {
2553   /* Alignment is maintained in the first element of the group.  */
2554   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2555   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2556   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2557   tree vectype = SLP_TREE_VECTYPE (node);
2558   poly_uint64 vector_alignment
2559     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2560                  BITS_PER_UNIT);
2561   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2562     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2563   /* Re-analyze alignment when we're facing a vectorization with a bigger
2564      alignment requirement.  */
2565   else if (known_lt (dr_info->target_alignment, vector_alignment))
2566     {
2567       poly_uint64 old_target_alignment = dr_info->target_alignment;
2568       int old_misalignment = dr_info->misalignment;
2569       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2570       /* But keep knowledge about a smaller alignment.  */
2571       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2572           && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2573         {
2574           dr_info->target_alignment = old_target_alignment;
2575           dr_info->misalignment = old_misalignment;
2576         }
2577     }
2578   /* When we ever face unordered target alignments the first one wins in terms
2579      of analyzing and the other will become unknown in dr_misalignment.  */
2580   return true;
2581 }
2582
2583 /* Function vect_slp_analyze_instance_alignment
2584
2585    Analyze the alignment of the data-references in the SLP instance.
2586    Return FALSE if a data reference is found that cannot be vectorized.  */
2587
2588 bool
2589 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2590                                                 slp_instance instance)
2591 {
2592   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2593
2594   slp_tree node;
2595   unsigned i;
2596   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2597     if (! vect_slp_analyze_node_alignment (vinfo, node))
2598       return false;
2599
2600   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2601       && ! vect_slp_analyze_node_alignment
2602              (vinfo, SLP_INSTANCE_TREE (instance)))
2603     return false;
2604
2605   return true;
2606 }
2607
2608
2609 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2610    accesses of legal size, step, etc.  Detect gaps, single element
2611    interleaving, and other special cases. Set grouped access info.
2612    Collect groups of strided stores for further use in SLP analysis.
2613    Worker for vect_analyze_group_access.  */
2614
2615 static bool
2616 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2617 {
2618   data_reference *dr = dr_info->dr;
2619   tree step = DR_STEP (dr);
2620   tree scalar_type = TREE_TYPE (DR_REF (dr));
2621   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2622   stmt_vec_info stmt_info = dr_info->stmt;
2623   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2624   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2625   HOST_WIDE_INT dr_step = -1;
2626   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2627   bool slp_impossible = false;
2628
2629   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2630      size of the interleaving group (including gaps).  */
2631   if (tree_fits_shwi_p (step))
2632     {
2633       dr_step = tree_to_shwi (step);
2634       /* Check that STEP is a multiple of type size.  Otherwise there is
2635          a non-element-sized gap at the end of the group which we
2636          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2637          ???  As we can handle non-constant step fine here we should
2638          simply remove uses of DR_GROUP_GAP between the last and first
2639          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2640          simply not include that gap.  */
2641       if ((dr_step % type_size) != 0)
2642         {
2643           if (dump_enabled_p ())
2644             dump_printf_loc (MSG_NOTE, vect_location,
2645                              "Step %T is not a multiple of the element size"
2646                              " for %T\n",
2647                              step, DR_REF (dr));
2648           return false;
2649         }
2650       groupsize = absu_hwi (dr_step) / type_size;
2651     }
2652   else
2653     groupsize = 0;
2654
2655   /* Not consecutive access is possible only if it is a part of interleaving.  */
2656   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2657     {
2658       /* Check if it this DR is a part of interleaving, and is a single
2659          element of the group that is accessed in the loop.  */
2660
2661       /* Gaps are supported only for loads. STEP must be a multiple of the type
2662          size.  */
2663       if (DR_IS_READ (dr)
2664           && (dr_step % type_size) == 0
2665           && groupsize > 0
2666           /* This could be UINT_MAX but as we are generating code in a very
2667              inefficient way we have to cap earlier.
2668              See PR91403 for example.  */
2669           && groupsize <= 4096)
2670         {
2671           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2672           DR_GROUP_SIZE (stmt_info) = groupsize;
2673           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2674           if (dump_enabled_p ())
2675             dump_printf_loc (MSG_NOTE, vect_location,
2676                              "Detected single element interleaving %T"
2677                              " step %T\n",
2678                              DR_REF (dr), step);
2679
2680           return true;
2681         }
2682
2683       if (dump_enabled_p ())
2684         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2685                          "not consecutive access %G", stmt_info->stmt);
2686
2687       if (bb_vinfo)
2688         {
2689           /* Mark the statement as unvectorizable.  */
2690           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2691           return true;
2692         }
2693
2694       if (dump_enabled_p ())
2695         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2696       STMT_VINFO_STRIDED_P (stmt_info) = true;
2697       return true;
2698     }
2699
2700   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2701     {
2702       /* First stmt in the interleaving chain. Check the chain.  */
2703       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2704       struct data_reference *data_ref = dr;
2705       unsigned int count = 1;
2706       tree prev_init = DR_INIT (data_ref);
2707       HOST_WIDE_INT diff, gaps = 0;
2708
2709       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2710       while (next)
2711         {
2712           /* We never have the same DR multiple times.  */
2713           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2714                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2715
2716           data_ref = STMT_VINFO_DATA_REF (next);
2717
2718           /* All group members have the same STEP by construction.  */
2719           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2720
2721           /* Check that the distance between two accesses is equal to the type
2722              size. Otherwise, we have gaps.  */
2723           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2724                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2725           if (diff < 1 || diff > UINT_MAX)
2726             {
2727               /* For artificial testcases with array accesses with large
2728                  constant indices we can run into overflow issues which
2729                  can end up fooling the groupsize constraint below so
2730                  check the individual gaps (which are represented as
2731                  unsigned int) as well.  */
2732               if (dump_enabled_p ())
2733                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2734                                  "interleaved access with gap larger "
2735                                  "than representable\n");
2736               return false;
2737             }
2738           if (diff != 1)
2739             {
2740               /* FORNOW: SLP of accesses with gaps is not supported.  */
2741               slp_impossible = true;
2742               if (DR_IS_WRITE (data_ref))
2743                 {
2744                   if (dump_enabled_p ())
2745                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2746                                      "interleaved store with gaps\n");
2747                   return false;
2748                 }
2749
2750               gaps += diff - 1;
2751             }
2752
2753           last_accessed_element += diff;
2754
2755           /* Store the gap from the previous member of the group. If there is no
2756              gap in the access, DR_GROUP_GAP is always 1.  */
2757           DR_GROUP_GAP (next) = diff;
2758
2759           prev_init = DR_INIT (data_ref);
2760           next = DR_GROUP_NEXT_ELEMENT (next);
2761           /* Count the number of data-refs in the chain.  */
2762           count++;
2763         }
2764
2765       if (groupsize == 0)
2766         groupsize = count + gaps;
2767
2768       /* This could be UINT_MAX but as we are generating code in a very
2769          inefficient way we have to cap earlier.  See PR78699 for example.  */
2770       if (groupsize > 4096)
2771         {
2772           if (dump_enabled_p ())
2773             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2774                              "group is too large\n");
2775           return false;
2776         }
2777
2778       /* Check that the size of the interleaving is equal to count for stores,
2779          i.e., that there are no gaps.  */
2780       if (groupsize != count
2781           && !DR_IS_READ (dr))
2782         {
2783           groupsize = count;
2784           STMT_VINFO_STRIDED_P (stmt_info) = true;
2785         }
2786
2787       /* If there is a gap after the last load in the group it is the
2788          difference between the groupsize and the last accessed
2789          element.
2790          When there is no gap, this difference should be 0.  */
2791       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2792
2793       DR_GROUP_SIZE (stmt_info) = groupsize;
2794       if (dump_enabled_p ())
2795         {
2796           dump_printf_loc (MSG_NOTE, vect_location,
2797                            "Detected interleaving ");
2798           if (DR_IS_READ (dr))
2799             dump_printf (MSG_NOTE, "load ");
2800           else if (STMT_VINFO_STRIDED_P (stmt_info))
2801             dump_printf (MSG_NOTE, "strided store ");
2802           else
2803             dump_printf (MSG_NOTE, "store ");
2804           dump_printf (MSG_NOTE, "of size %u\n",
2805                        (unsigned)groupsize);
2806           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2807           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2808           while (next)
2809             {
2810               if (DR_GROUP_GAP (next) != 1)
2811                 dump_printf_loc (MSG_NOTE, vect_location,
2812                                  "\t<gap of %d elements>\n",
2813                                  DR_GROUP_GAP (next) - 1);
2814               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2815               next = DR_GROUP_NEXT_ELEMENT (next);
2816             }
2817           if (DR_GROUP_GAP (stmt_info) != 0)
2818             dump_printf_loc (MSG_NOTE, vect_location,
2819                              "\t<gap of %d elements>\n",
2820                              DR_GROUP_GAP (stmt_info));
2821         }
2822
2823       /* SLP: create an SLP data structure for every interleaving group of
2824          stores for further analysis in vect_analyse_slp.  */
2825       if (DR_IS_WRITE (dr) && !slp_impossible)
2826         {
2827           if (loop_vinfo)
2828             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2829           if (bb_vinfo)
2830             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2831         }
2832     }
2833
2834   return true;
2835 }
2836
2837 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2838    accesses of legal size, step, etc.  Detect gaps, single element
2839    interleaving, and other special cases. Set grouped access info.
2840    Collect groups of strided stores for further use in SLP analysis.  */
2841
2842 static bool
2843 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2844 {
2845   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2846     {
2847       /* Dissolve the group if present.  */
2848       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2849       while (stmt_info)
2850         {
2851           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2852           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2853           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2854           stmt_info = next;
2855         }
2856       return false;
2857     }
2858   return true;
2859 }
2860
2861 /* Analyze the access pattern of the data-reference DR_INFO.
2862    In case of non-consecutive accesses call vect_analyze_group_access() to
2863    analyze groups of accesses.  */
2864
2865 static bool
2866 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2867 {
2868   data_reference *dr = dr_info->dr;
2869   tree step = DR_STEP (dr);
2870   tree scalar_type = TREE_TYPE (DR_REF (dr));
2871   stmt_vec_info stmt_info = dr_info->stmt;
2872   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2873   class loop *loop = NULL;
2874
2875   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2876     return true;
2877
2878   if (loop_vinfo)
2879     loop = LOOP_VINFO_LOOP (loop_vinfo);
2880
2881   if (loop_vinfo && !step)
2882     {
2883       if (dump_enabled_p ())
2884         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885                          "bad data-ref access in loop\n");
2886       return false;
2887     }
2888
2889   /* Allow loads with zero step in inner-loop vectorization.  */
2890   if (loop_vinfo && integer_zerop (step))
2891     {
2892       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2893       if (!nested_in_vect_loop_p (loop, stmt_info))
2894         return DR_IS_READ (dr);
2895       /* Allow references with zero step for outer loops marked
2896          with pragma omp simd only - it guarantees absence of
2897          loop-carried dependencies between inner loop iterations.  */
2898       if (loop->safelen < 2)
2899         {
2900           if (dump_enabled_p ())
2901             dump_printf_loc (MSG_NOTE, vect_location,
2902                              "zero step in inner loop of nest\n");
2903           return false;
2904         }
2905     }
2906
2907   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2908     {
2909       /* Interleaved accesses are not yet supported within outer-loop
2910         vectorization for references in the inner-loop.  */
2911       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2912
2913       /* For the rest of the analysis we use the outer-loop step.  */
2914       step = STMT_VINFO_DR_STEP (stmt_info);
2915       if (integer_zerop (step))
2916         {
2917           if (dump_enabled_p ())
2918             dump_printf_loc (MSG_NOTE, vect_location,
2919                              "zero step in outer loop.\n");
2920           return DR_IS_READ (dr);
2921         }
2922     }
2923
2924   /* Consecutive?  */
2925   if (TREE_CODE (step) == INTEGER_CST)
2926     {
2927       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2928       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2929           || (dr_step < 0
2930               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2931         {
2932           /* Mark that it is not interleaving.  */
2933           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2934           return true;
2935         }
2936     }
2937
2938   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2939     {
2940       if (dump_enabled_p ())
2941         dump_printf_loc (MSG_NOTE, vect_location,
2942                          "grouped access in outer loop.\n");
2943       return false;
2944     }
2945
2946
2947   /* Assume this is a DR handled by non-constant strided load case.  */
2948   if (TREE_CODE (step) != INTEGER_CST)
2949     return (STMT_VINFO_STRIDED_P (stmt_info)
2950             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2951                 || vect_analyze_group_access (vinfo, dr_info)));
2952
2953   /* Not consecutive access - check if it's a part of interleaving group.  */
2954   return vect_analyze_group_access (vinfo, dr_info);
2955 }
2956
2957 /* Compare two data-references DRA and DRB to group them into chunks
2958    suitable for grouping.  */
2959
2960 static int
2961 dr_group_sort_cmp (const void *dra_, const void *drb_)
2962 {
2963   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2964   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2965   data_reference_p dra = dra_info->dr;
2966   data_reference_p drb = drb_info->dr;
2967   int cmp;
2968
2969   /* Stabilize sort.  */
2970   if (dra == drb)
2971     return 0;
2972
2973   /* Different group IDs lead never belong to the same group.  */
2974   if (dra_info->group != drb_info->group)
2975     return dra_info->group < drb_info->group ? -1 : 1;
2976
2977   /* Ordering of DRs according to base.  */
2978   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2979                                DR_BASE_ADDRESS (drb));
2980   if (cmp != 0)
2981     return cmp;
2982
2983   /* And according to DR_OFFSET.  */
2984   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2985   if (cmp != 0)
2986     return cmp;
2987
2988   /* Put reads before writes.  */
2989   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2990     return DR_IS_READ (dra) ? -1 : 1;
2991
2992   /* Then sort after access size.  */
2993   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2994                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2995   if (cmp != 0)
2996     return cmp;
2997
2998   /* And after step.  */
2999   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3000   if (cmp != 0)
3001     return cmp;
3002
3003   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3004   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3005   if (cmp == 0)
3006     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3007   return cmp;
3008 }
3009
3010 /* If OP is the result of a conversion, return the unconverted value,
3011    otherwise return null.  */
3012
3013 static tree
3014 strip_conversion (tree op)
3015 {
3016   if (TREE_CODE (op) != SSA_NAME)
3017     return NULL_TREE;
3018   gimple *stmt = SSA_NAME_DEF_STMT (op);
3019   if (!is_gimple_assign (stmt)
3020       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3021     return NULL_TREE;
3022   return gimple_assign_rhs1 (stmt);
3023 }
3024
3025 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3026    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3027    be grouped in SLP mode.  */
3028
3029 static bool
3030 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3031                    bool allow_slp_p)
3032 {
3033   if (gimple_assign_single_p (stmt1_info->stmt))
3034     return gimple_assign_single_p (stmt2_info->stmt);
3035
3036   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3037   if (call1 && gimple_call_internal_p (call1))
3038     {
3039       /* Check for two masked loads or two masked stores.  */
3040       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3041       if (!call2 || !gimple_call_internal_p (call2))
3042         return false;
3043       internal_fn ifn = gimple_call_internal_fn (call1);
3044       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3045         return false;
3046       if (ifn != gimple_call_internal_fn (call2))
3047         return false;
3048
3049       /* Check that the masks are the same.  Cope with casts of masks,
3050          like those created by build_mask_conversion.  */
3051       tree mask1 = gimple_call_arg (call1, 2);
3052       tree mask2 = gimple_call_arg (call2, 2);
3053       if (!operand_equal_p (mask1, mask2, 0)
3054           && (ifn == IFN_MASK_STORE || !allow_slp_p))
3055         {
3056           mask1 = strip_conversion (mask1);
3057           if (!mask1)
3058             return false;
3059           mask2 = strip_conversion (mask2);
3060           if (!mask2)
3061             return false;
3062           if (!operand_equal_p (mask1, mask2, 0))
3063             return false;
3064         }
3065       return true;
3066     }
3067
3068   return false;
3069 }
3070
3071 /* Function vect_analyze_data_ref_accesses.
3072
3073    Analyze the access pattern of all the data references in the loop.
3074
3075    FORNOW: the only access pattern that is considered vectorizable is a
3076            simple step 1 (consecutive) access.
3077
3078    FORNOW: handle only arrays and pointer accesses.  */
3079
3080 opt_result
3081 vect_analyze_data_ref_accesses (vec_info *vinfo,
3082                                 vec<int> *dataref_groups)
3083 {
3084   unsigned int i;
3085   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3086
3087   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3088
3089   if (datarefs.is_empty ())
3090     return opt_result::success ();
3091
3092   /* Sort the array of datarefs to make building the interleaving chains
3093      linear.  Don't modify the original vector's order, it is needed for
3094      determining what dependencies are reversed.  */
3095   vec<dr_vec_info *> datarefs_copy;
3096   datarefs_copy.create (datarefs.length ());
3097   for (unsigned i = 0; i < datarefs.length (); i++)
3098     {
3099       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3100       /* If the caller computed DR grouping use that, otherwise group by
3101          basic blocks.  */
3102       if (dataref_groups)
3103         dr_info->group = (*dataref_groups)[i];
3104       else
3105         dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3106       datarefs_copy.quick_push (dr_info);
3107     }
3108   datarefs_copy.qsort (dr_group_sort_cmp);
3109   hash_set<stmt_vec_info> to_fixup;
3110
3111   /* Build the interleaving chains.  */
3112   for (i = 0; i < datarefs_copy.length () - 1;)
3113     {
3114       dr_vec_info *dr_info_a = datarefs_copy[i];
3115       data_reference_p dra = dr_info_a->dr;
3116       int dra_group_id = dr_info_a->group;
3117       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3118       stmt_vec_info lastinfo = NULL;
3119       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3120           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3121         {
3122           ++i;
3123           continue;
3124         }
3125       for (i = i + 1; i < datarefs_copy.length (); ++i)
3126         {
3127           dr_vec_info *dr_info_b = datarefs_copy[i];
3128           data_reference_p drb = dr_info_b->dr;
3129           int drb_group_id = dr_info_b->group;
3130           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3131           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3132               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3133             break;
3134
3135           /* ???  Imperfect sorting (non-compatible types, non-modulo
3136              accesses, same accesses) can lead to a group to be artificially
3137              split here as we don't just skip over those.  If it really
3138              matters we can push those to a worklist and re-iterate
3139              over them.  The we can just skip ahead to the next DR here.  */
3140
3141           /* DRs in a different DR group should not be put into the same
3142              interleaving group.  */
3143           if (dra_group_id != drb_group_id)
3144             break;
3145
3146           /* Check that the data-refs have same first location (except init)
3147              and they are both either store or load (not load and store,
3148              not masked loads or stores).  */
3149           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3150               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3151                                         DR_BASE_ADDRESS (drb)) != 0
3152               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3153               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3154             break;
3155
3156           /* Check that the data-refs have the same constant size.  */
3157           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3158           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3159           if (!tree_fits_uhwi_p (sza)
3160               || !tree_fits_uhwi_p (szb)
3161               || !tree_int_cst_equal (sza, szb))
3162             break;
3163
3164           /* Check that the data-refs have the same step.  */
3165           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3166             break;
3167
3168           /* Check the types are compatible.
3169              ???  We don't distinguish this during sorting.  */
3170           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3171                                    TREE_TYPE (DR_REF (drb))))
3172             break;
3173
3174           /* Check that the DR_INITs are compile-time constants.  */
3175           if (!tree_fits_shwi_p (DR_INIT (dra))
3176               || !tree_fits_shwi_p (DR_INIT (drb)))
3177             break;
3178
3179           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3180              just hold extra information.  */
3181           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3182               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3183               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3184             break;
3185
3186           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3187           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3188           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3189           HOST_WIDE_INT init_prev
3190             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3191           gcc_assert (init_a <= init_b
3192                       && init_a <= init_prev
3193                       && init_prev <= init_b);
3194
3195           /* Do not place the same access in the interleaving chain twice.  */
3196           if (init_b == init_prev)
3197             {
3198               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3199                           < gimple_uid (DR_STMT (drb)));
3200               /* Simply link in duplicates and fix up the chain below.  */
3201             }
3202           else
3203             {
3204               /* If init_b == init_a + the size of the type * k, we have an
3205                  interleaving, and DRA is accessed before DRB.  */
3206               unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3207               if (type_size_a == 0
3208                   || (((unsigned HOST_WIDE_INT)init_b - init_a)
3209                       % type_size_a != 0))
3210                 break;
3211
3212               /* If we have a store, the accesses are adjacent.  This splits
3213                  groups into chunks we support (we don't support vectorization
3214                  of stores with gaps).  */
3215               if (!DR_IS_READ (dra)
3216                   && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3217                       != type_size_a))
3218                 break;
3219
3220               /* If the step (if not zero or non-constant) is smaller than the
3221                  difference between data-refs' inits this splits groups into
3222                  suitable sizes.  */
3223               if (tree_fits_shwi_p (DR_STEP (dra)))
3224                 {
3225                   unsigned HOST_WIDE_INT step
3226                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3227                   if (step != 0
3228                       && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3229                     break;
3230                 }
3231             }
3232
3233           if (dump_enabled_p ())
3234             dump_printf_loc (MSG_NOTE, vect_location,
3235                              DR_IS_READ (dra)
3236                              ? "Detected interleaving load %T and %T\n"
3237                              : "Detected interleaving store %T and %T\n",
3238                              DR_REF (dra), DR_REF (drb));
3239
3240           /* Link the found element into the group list.  */
3241           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3242             {
3243               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3244               lastinfo = stmtinfo_a;
3245             }
3246           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3247           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3248           lastinfo = stmtinfo_b;
3249
3250           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3251             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3252
3253           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3254             dump_printf_loc (MSG_NOTE, vect_location,
3255                              "Load suitable for SLP vectorization only.\n");
3256
3257           if (init_b == init_prev
3258               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3259               && dump_enabled_p ())
3260             dump_printf_loc (MSG_NOTE, vect_location,
3261                              "Queuing group with duplicate access for fixup\n");
3262         }
3263     }
3264
3265   /* Fixup groups with duplicate entries by splitting it.  */
3266   while (1)
3267     {
3268       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3269       if (!(it != to_fixup.end ()))
3270         break;
3271       stmt_vec_info grp = *it;
3272       to_fixup.remove (grp);
3273
3274       /* Find the earliest duplicate group member.  */
3275       unsigned first_duplicate = -1u;
3276       stmt_vec_info next, g = grp;
3277       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3278         {
3279           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3280                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3281               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3282             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3283           g = next;
3284         }
3285       if (first_duplicate == -1U)
3286         continue;
3287
3288       /* Then move all stmts after the first duplicate to a new group.
3289          Note this is a heuristic but one with the property that *it
3290          is fixed up completely.  */
3291       g = grp;
3292       stmt_vec_info newgroup = NULL, ng = grp;
3293       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3294         {
3295           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3296             {
3297               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3298               if (!newgroup)
3299                 newgroup = next;
3300               else
3301                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3302               ng = next;
3303               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3304             }
3305           else
3306             g = DR_GROUP_NEXT_ELEMENT (g);
3307         }
3308       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3309
3310       /* Fixup the new group which still may contain duplicates.  */
3311       to_fixup.add (newgroup);
3312     }
3313
3314   dr_vec_info *dr_info;
3315   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3316     {
3317       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3318           && !vect_analyze_data_ref_access (vinfo, dr_info))
3319         {
3320           if (dump_enabled_p ())
3321             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322                              "not vectorized: complicated access pattern.\n");
3323
3324           if (is_a <bb_vec_info> (vinfo))
3325             {
3326               /* Mark the statement as not vectorizable.  */
3327               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3328               continue;
3329             }
3330           else
3331             {
3332               datarefs_copy.release ();
3333               return opt_result::failure_at (dr_info->stmt->stmt,
3334                                              "not vectorized:"
3335                                              " complicated access pattern.\n");
3336             }
3337         }
3338     }
3339
3340   datarefs_copy.release ();
3341   return opt_result::success ();
3342 }
3343
3344 /* Function vect_vfa_segment_size.
3345
3346    Input:
3347      DR_INFO: The data reference.
3348      LENGTH_FACTOR: segment length to consider.
3349
3350    Return a value suitable for the dr_with_seg_len::seg_len field.
3351    This is the "distance travelled" by the pointer from the first
3352    iteration in the segment to the last.  Note that it does not include
3353    the size of the access; in effect it only describes the first byte.  */
3354
3355 static tree
3356 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3357 {
3358   length_factor = size_binop (MINUS_EXPR,
3359                               fold_convert (sizetype, length_factor),
3360                               size_one_node);
3361   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3362                      length_factor);
3363 }
3364
3365 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3366    gives the worst-case number of bytes covered by the segment.  */
3367
3368 static unsigned HOST_WIDE_INT
3369 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3370 {
3371   stmt_vec_info stmt_vinfo = dr_info->stmt;
3372   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3373   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3374   unsigned HOST_WIDE_INT access_size = ref_size;
3375   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3376     {
3377       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3378       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3379     }
3380   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3381   int misalignment;
3382   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3383       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3384       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3385           == dr_explicit_realign_optimized))
3386     {
3387       /* We might access a full vector's worth.  */
3388       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3389     }
3390   return access_size;
3391 }
3392
3393 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3394    describes.  */
3395
3396 static unsigned int
3397 vect_vfa_align (dr_vec_info *dr_info)
3398 {
3399   return dr_alignment (dr_info->dr);
3400 }
3401
3402 /* Function vect_no_alias_p.
3403
3404    Given data references A and B with equal base and offset, see whether
3405    the alias relation can be decided at compilation time.  Return 1 if
3406    it can and the references alias, 0 if it can and the references do
3407    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3408    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3409    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3410
3411 static int
3412 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3413                          tree segment_length_a, tree segment_length_b,
3414                          unsigned HOST_WIDE_INT access_size_a,
3415                          unsigned HOST_WIDE_INT access_size_b)
3416 {
3417   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3418   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3419   poly_uint64 const_length_a;
3420   poly_uint64 const_length_b;
3421
3422   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3423      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3424      [a, a+12) */
3425   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3426     {
3427       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3428       offset_a -= const_length_a;
3429     }
3430   else
3431     const_length_a = tree_to_poly_uint64 (segment_length_a);
3432   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3433     {
3434       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3435       offset_b -= const_length_b;
3436     }
3437   else
3438     const_length_b = tree_to_poly_uint64 (segment_length_b);
3439
3440   const_length_a += access_size_a;
3441   const_length_b += access_size_b;
3442
3443   if (ranges_known_overlap_p (offset_a, const_length_a,
3444                               offset_b, const_length_b))
3445     return 1;
3446
3447   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3448                                offset_b, const_length_b))
3449     return 0;
3450
3451   return -1;
3452 }
3453
3454 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3455    in DDR is >= VF.  */
3456
3457 static bool
3458 dependence_distance_ge_vf (data_dependence_relation *ddr,
3459                            unsigned int loop_depth, poly_uint64 vf)
3460 {
3461   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3462       || DDR_NUM_DIST_VECTS (ddr) == 0)
3463     return false;
3464
3465   /* If the dependence is exact, we should have limited the VF instead.  */
3466   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3467
3468   unsigned int i;
3469   lambda_vector dist_v;
3470   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3471     {
3472       HOST_WIDE_INT dist = dist_v[loop_depth];
3473       if (dist != 0
3474           && !(dist > 0 && DDR_REVERSED_P (ddr))
3475           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3476         return false;
3477     }
3478
3479   if (dump_enabled_p ())
3480     dump_printf_loc (MSG_NOTE, vect_location,
3481                      "dependence distance between %T and %T is >= VF\n",
3482                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3483
3484   return true;
3485 }
3486
3487 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3488
3489 static void
3490 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3491 {
3492   dump_printf (dump_kind, "%s (%T) >= ",
3493                lower_bound.unsigned_p ? "unsigned" : "abs",
3494                lower_bound.expr);
3495   dump_dec (dump_kind, lower_bound.min_value);
3496 }
3497
3498 /* Record that the vectorized loop requires the vec_lower_bound described
3499    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3500
3501 static void
3502 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3503                         poly_uint64 min_value)
3504 {
3505   vec<vec_lower_bound> &lower_bounds
3506     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3507   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3508     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3509       {
3510         unsigned_p &= lower_bounds[i].unsigned_p;
3511         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3512         if (lower_bounds[i].unsigned_p != unsigned_p
3513             || maybe_lt (lower_bounds[i].min_value, min_value))
3514           {
3515             lower_bounds[i].unsigned_p = unsigned_p;
3516             lower_bounds[i].min_value = min_value;
3517             if (dump_enabled_p ())
3518               {
3519                 dump_printf_loc (MSG_NOTE, vect_location,
3520                                  "updating run-time check to ");
3521                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3522                 dump_printf (MSG_NOTE, "\n");
3523               }
3524           }
3525         return;
3526       }
3527
3528   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3529   if (dump_enabled_p ())
3530     {
3531       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3532       dump_lower_bound (MSG_NOTE, lower_bound);
3533       dump_printf (MSG_NOTE, "\n");
3534     }
3535   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3536 }
3537
3538 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3539    will span fewer than GAP bytes.  */
3540
3541 static bool
3542 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3543                   poly_int64 gap)
3544 {
3545   stmt_vec_info stmt_info = dr_info->stmt;
3546   HOST_WIDE_INT count
3547     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3548   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3549     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3550   return (estimated_poly_value (gap)
3551           <= count * vect_get_scalar_dr_size (dr_info));
3552 }
3553
3554 /* Return true if we know that there is no alias between DR_INFO_A and
3555    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3556    When returning true, set *LOWER_BOUND_OUT to this N.  */
3557
3558 static bool
3559 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3560                                 poly_uint64 *lower_bound_out)
3561 {
3562   /* Check that there is a constant gap of known sign between DR_A
3563      and DR_B.  */
3564   data_reference *dr_a = dr_info_a->dr;
3565   data_reference *dr_b = dr_info_b->dr;
3566   poly_int64 init_a, init_b;
3567   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3568       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3569       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3570       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3571       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3572       || !ordered_p (init_a, init_b))
3573     return false;
3574
3575   /* Sort DR_A and DR_B by the address they access.  */
3576   if (maybe_lt (init_b, init_a))
3577     {
3578       std::swap (init_a, init_b);
3579       std::swap (dr_info_a, dr_info_b);
3580       std::swap (dr_a, dr_b);
3581     }
3582
3583   /* If the two accesses could be dependent within a scalar iteration,
3584      make sure that we'd retain their order.  */
3585   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3586       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3587     return false;
3588
3589   /* There is no alias if abs (DR_STEP) is greater than or equal to
3590      the bytes spanned by the combination of the two accesses.  */
3591   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3592   return true;
3593 }
3594
3595 /* Function vect_prune_runtime_alias_test_list.
3596
3597    Prune a list of ddrs to be tested at run-time by versioning for alias.
3598    Merge several alias checks into one if possible.
3599    Return FALSE if resulting list of ddrs is longer than allowed by
3600    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3601
3602 opt_result
3603 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3604 {
3605   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3606   hash_set <tree_pair_hash> compared_objects;
3607
3608   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3609   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3610     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3611   const vec<vec_object_pair> &check_unequal_addrs
3612     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3613   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3614   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3615
3616   ddr_p ddr;
3617   unsigned int i;
3618   tree length_factor;
3619
3620   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3621
3622   /* Step values are irrelevant for aliasing if the number of vector
3623      iterations is equal to the number of scalar iterations (which can
3624      happen for fully-SLP loops).  */
3625   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3626
3627   if (!vf_one_p)
3628     {
3629       /* Convert the checks for nonzero steps into bound tests.  */
3630       tree value;
3631       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3632         vect_check_lower_bound (loop_vinfo, value, true, 1);
3633     }
3634
3635   if (may_alias_ddrs.is_empty ())
3636     return opt_result::success ();
3637
3638   comp_alias_ddrs.create (may_alias_ddrs.length ());
3639
3640   unsigned int loop_depth
3641     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3642                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3643
3644   /* First, we collect all data ref pairs for aliasing checks.  */
3645   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3646     {
3647       poly_uint64 lower_bound;
3648       tree segment_length_a, segment_length_b;
3649       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3650       unsigned int align_a, align_b;
3651
3652       /* Ignore the alias if the VF we chose ended up being no greater
3653          than the dependence distance.  */
3654       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3655         continue;
3656
3657       if (DDR_OBJECT_A (ddr))
3658         {
3659           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3660           if (!compared_objects.add (new_pair))
3661             {
3662               if (dump_enabled_p ())
3663                 dump_printf_loc (MSG_NOTE, vect_location,
3664                                  "checking that %T and %T"
3665                                  " have different addresses\n",
3666                                  new_pair.first, new_pair.second);
3667               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3668             }
3669           continue;
3670         }
3671
3672       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3673       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3674
3675       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3676       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3677
3678       bool preserves_scalar_order_p
3679         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3680       bool ignore_step_p
3681           = (vf_one_p
3682              && (preserves_scalar_order_p
3683                  || operand_equal_p (DR_STEP (dr_info_a->dr),
3684                                      DR_STEP (dr_info_b->dr))));
3685
3686       /* Skip the pair if inter-iteration dependencies are irrelevant
3687          and intra-iteration dependencies are guaranteed to be honored.  */
3688       if (ignore_step_p
3689           && (preserves_scalar_order_p
3690               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3691                                                  &lower_bound)))
3692         {
3693           if (dump_enabled_p ())
3694             dump_printf_loc (MSG_NOTE, vect_location,
3695                              "no need for alias check between "
3696                              "%T and %T when VF is 1\n",
3697                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3698           continue;
3699         }
3700
3701       /* See whether we can handle the alias using a bounds check on
3702          the step, and whether that's likely to be the best approach.
3703          (It might not be, for example, if the minimum step is much larger
3704          than the number of bytes handled by one vector iteration.)  */
3705       if (!ignore_step_p
3706           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3707           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3708                                              &lower_bound)
3709           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3710               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3711         {
3712           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3713           if (dump_enabled_p ())
3714             {
3715               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3716                                "%T and %T when the step %T is outside ",
3717                                DR_REF (dr_info_a->dr),
3718                                DR_REF (dr_info_b->dr),
3719                                DR_STEP (dr_info_a->dr));
3720               if (unsigned_p)
3721                 dump_printf (MSG_NOTE, "[0");
3722               else
3723                 {
3724                   dump_printf (MSG_NOTE, "(");
3725                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3726                 }
3727               dump_printf (MSG_NOTE, ", ");
3728               dump_dec (MSG_NOTE, lower_bound);
3729               dump_printf (MSG_NOTE, ")\n");
3730             }
3731           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3732                                   unsigned_p, lower_bound);
3733           continue;
3734         }
3735
3736       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3737       if (dr_group_first_a)
3738         {
3739           stmt_info_a = dr_group_first_a;
3740           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741         }
3742
3743       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3744       if (dr_group_first_b)
3745         {
3746           stmt_info_b = dr_group_first_b;
3747           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748         }
3749
3750       if (ignore_step_p)
3751         {
3752           segment_length_a = size_zero_node;
3753           segment_length_b = size_zero_node;
3754         }
3755       else
3756         {
3757           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3758                                 DR_STEP (dr_info_b->dr), 0))
3759             length_factor = scalar_loop_iters;
3760           else
3761             length_factor = size_int (vect_factor);
3762           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3763           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3764         }
3765       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3766       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3767       align_a = vect_vfa_align (dr_info_a);
3768       align_b = vect_vfa_align (dr_info_b);
3769
3770       /* See whether the alias is known at compilation time.  */
3771       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3772                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
3773           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3774                               DR_OFFSET (dr_info_b->dr), 0)
3775           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3776           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3777           && poly_int_tree_p (segment_length_a)
3778           && poly_int_tree_p (segment_length_b))
3779         {
3780           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3781                                              segment_length_a,
3782                                              segment_length_b,
3783                                              access_size_a,
3784                                              access_size_b);
3785           if (res >= 0 && dump_enabled_p ())
3786             {
3787               dump_printf_loc (MSG_NOTE, vect_location,
3788                                "can tell at compile time that %T and %T",
3789                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3790               if (res == 0)
3791                 dump_printf (MSG_NOTE, " do not alias\n");
3792               else
3793                 dump_printf (MSG_NOTE, " alias\n");
3794             }
3795
3796           if (res == 0)
3797             continue;
3798
3799           if (res == 1)
3800             return opt_result::failure_at (stmt_info_b->stmt,
3801                                            "not vectorized:"
3802                                            " compilation time alias: %G%G",
3803                                            stmt_info_a->stmt,
3804                                            stmt_info_b->stmt);
3805         }
3806
3807       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3808                             access_size_a, align_a);
3809       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3810                             access_size_b, align_b);
3811       /* Canonicalize the order to be the one that's needed for accurate
3812          RAW, WAR and WAW flags, in cases where the data references are
3813          well-ordered.  The order doesn't really matter otherwise,
3814          but we might as well be consistent.  */
3815       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3816         std::swap (dr_a, dr_b);
3817
3818       dr_with_seg_len_pair_t dr_with_seg_len_pair
3819         (dr_a, dr_b, (preserves_scalar_order_p
3820                       ? dr_with_seg_len_pair_t::WELL_ORDERED
3821                       : dr_with_seg_len_pair_t::REORDERED));
3822
3823       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824     }
3825
3826   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3827
3828   unsigned int count = (comp_alias_ddrs.length ()
3829                         + check_unequal_addrs.length ());
3830
3831   if (count
3832       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3833           == VECT_COST_MODEL_VERY_CHEAP))
3834     return opt_result::failure_at
3835       (vect_location, "would need a runtime alias check\n");
3836
3837   if (dump_enabled_p ())
3838     dump_printf_loc (MSG_NOTE, vect_location,
3839                      "improved number of alias checks from %d to %d\n",
3840                      may_alias_ddrs.length (), count);
3841   unsigned limit = param_vect_max_version_for_alias_checks;
3842   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3843     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3844   if (count > limit)
3845     return opt_result::failure_at
3846       (vect_location,
3847        "number of versioning for alias run-time tests exceeds %d "
3848        "(--param vect-max-version-for-alias-checks)\n", limit);
3849
3850   return opt_result::success ();
3851 }
3852
3853 /* Check whether we can use an internal function for a gather load
3854    or scatter store.  READ_P is true for loads and false for stores.
3855    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3856    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3857    is the type of the offset that is being applied to the invariant
3858    base address.  SCALE is the amount by which the offset should
3859    be multiplied *after* it has been converted to address width.
3860
3861    Return true if the function is supported, storing the function id in
3862    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.

        VECTYPE is the vector type of the data being loaded or stored; the
        function fails unless its elements are exactly as wide as MEMORY_TYPE.
        VINFO is used to find the vector type for each candidate offset type.

        If the target does not support OFFSET_TYPE itself, offset types of
        twice the precision (and the same signedness) are tried in turn,
        until the offset type is at least as wide as both a pointer and a
        vector element.  For unconditional accesses (!MASKED_P) the masked
        form of the function is tried as a fallback for each offset type,
        and is the function stored in *IFN_OUT if it is the one supported.
        Nothing is stored in the output parameters on failure.  */
3863
3864 bool
3865 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3866                           tree vectype, tree memory_type, tree offset_type,
3867                           int scale, internal_fn *ifn_out,
3868                           tree *offset_vectype_out)
3869 {
3870   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3871   unsigned int element_bits = vector_element_bits (vectype);
3872   if (element_bits != memory_bits)
3873     /* For now the vector elements must be the same width as the
3874        memory elements.  */
3875     return false;
3876
3877   /* Work out which function we need.  */
       /* IFN is the function that matches MASKED_P exactly; ALT_IFN is always
          the masked form and is only consulted when !MASKED_P.  */
3878   internal_fn ifn, alt_ifn;
3879   if (read_p)
3880     {
3881       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3882       alt_ifn = IFN_MASK_GATHER_LOAD;
3883     }
3884   else
3885     {
3886       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3887       alt_ifn = IFN_MASK_SCATTER_STORE;
3888     }
3889
       /* Try OFFSET_TYPE first and then successively wider offset types.
          Every path through the loop body either returns or widens
          OFFSET_TYPE.  */
3890   for (;;)
3891     {
3892       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3893       if (!offset_vectype)
3894         return false;
3895
3896       /* Test whether the target supports this combination.  */
3897       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3898                                                   offset_vectype, scale))
3899         {
3900           *ifn_out = ifn;
3901           *offset_vectype_out = offset_vectype;
3902           return true;
3903         }
           /* For an unconditional access, fall back to the masked form of
              the function if the target supports that instead.  */
3904       else if (!masked_p
3905                && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3906                                                           memory_type,
3907                                                           offset_vectype,
3908                                                           scale))
3909         {
3910           *ifn_out = alt_ifn;
3911           *offset_vectype_out = offset_vectype;
3912           return true;
3913         }
3914
           /* Give up once the offset type is already at least as wide as
              both a pointer and a vector element.  */
3915       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3916           && TYPE_PRECISION (offset_type) >= element_bits)
3917         return false;
3918
           /* Otherwise retry with an integer type of twice the precision and
              the same signedness.  */
3919       offset_type = build_nonstandard_integer_type
3920         (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3921     }
3922 }
3923
3924 /* STMT_INFO is a call to an internal gather load or scatter store function.
3925    Describe the operation in INFO.

        The base, offset and scale are taken from arguments 0, 1 and 2 of the
        call.  INFO->decl and INFO->offset_vectype are cleared and
        INFO->offset_dt is set to vect_unknown_def_type.  The element type
        comes from the statement's vector type and the memory type from the
        reference of the statement's data reference.  */
3926
3927 static void
3928 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3929                                    gather_scatter_info *info)
3930 {
3931   gcall *call = as_a <gcall *> (stmt_info->stmt);
3932   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3933   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3934
       /* The operation is the internal function being called, so there is no
          built-in function decl to record.  */
3935   info->ifn = gimple_call_internal_fn (call);
3936   info->decl = NULL_TREE;
       /* Call arguments 0, 1 and 2 supply the base, offset and scale.  */
3937   info->base = gimple_call_arg (call, 0);
3938   info->offset = gimple_call_arg (call, 1);
3939   info->offset_dt = vect_unknown_def_type;
3940   info->offset_vectype = NULL_TREE;
3941   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3942   info->element_type = TREE_TYPE (vectype);
3943   info->memory_type = TREE_TYPE (DR_REF (dr));
3944 }
3945
3946 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3947    gather load or scatter store.  Describe the operation in *INFO if so.

        LOOP_VINFO supplies the loop against which invariance is tested.  If
        STMT_INFO is already a call to an internal gather/scatter function,
        *INFO is filled in directly from the call.  Otherwise the address is
        split into a loop-invariant BASE and an SSA_NAME OFF that is not
        invariant in the loop, optionally multiplied by a constant SCALE, and
        the target is asked for a matching internal function or built-in.

        Note that when internal functions are preferred but none supports the
        final offset type, the function still returns true, with INFO->ifn
        set to IFN_LAST and INFO->decl set to NULL_TREE.  */
3948
3949 bool
3950 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3951                            gather_scatter_info *info)
3952 {
3953   HOST_WIDE_INT scale = 1;
3954   poly_int64 pbitpos, pbitsize;
3955   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3956   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3957   tree offtype = NULL_TREE;
3958   tree decl = NULL_TREE, base, off;
3959   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3960   tree memory_type = TREE_TYPE (DR_REF (dr));
3961   machine_mode pmode;
3962   int punsignedp, reversep, pvolatilep = 0;
3963   internal_fn ifn;
3964   tree offset_vectype;
3965   bool masked_p = false;
3966
3967   /* See whether this is already a call to a gather/scatter internal function.
3968      If not, see whether it's a masked load or store.  */
3969   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3970   if (call && gimple_call_internal_p (call))
3971     {
3972       ifn = gimple_call_internal_fn (call);
3973       if (internal_gather_scatter_fn_p (ifn))
3974         {
3975           vect_describe_gather_scatter_call (stmt_info, info);
3976           return true;
3977         }
3978       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3979     }
3980
3981   /* True if we should aim to use internal functions rather than
3982      built-in functions.  */
3983   bool use_ifn_p = (DR_IS_READ (dr)
3984                     ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3985                     : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3986
3987   base = DR_REF (dr);
3988   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3989      see if we can use the def stmt of the address.  */
3990   if (masked_p
3991       && TREE_CODE (base) == MEM_REF
3992       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3993       && integer_zerop (TREE_OPERAND (base, 1))
3994       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3995     {
3996       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3997       if (is_gimple_assign (def_stmt)
3998           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3999         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4000     }
4001
4002   /* The gather and scatter builtins need address of the form
4003      loop_invariant + vector * {1, 2, 4, 8}
4004      or
4005      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4006      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4007      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4008      multiplications and additions in it.  To get a vector, we need
4009      a single SSA_NAME that will be defined in the loop and will
4010      contain everything that is not loop invariant and that can be
4011      vectorized.  The following code attempts to find such a preexisting
4012      SSA_NAME OFF and put the loop invariants into a tree BASE
4013      that can be gimplified before the loop.  */
4014   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4015                               &punsignedp, &reversep, &pvolatilep);
4016   if (reversep)
4017     return false;
4018
4019   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4020
       /* Turn BASE into an address.  For a MEM_REF, use its pointer operand
          and move any nonzero constant offset of the MEM_REF into OFF.  */
4021   if (TREE_CODE (base) == MEM_REF)
4022     {
4023       if (!integer_zerop (TREE_OPERAND (base, 1)))
4024         {
4025           if (off == NULL_TREE)
4026             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4027           else
4028             off = size_binop (PLUS_EXPR, off,
4029                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
4030         }
4031       base = TREE_OPERAND (base, 0);
4032     }
4033   else
4034     base = build_fold_addr_expr (base);
4035
4036   if (off == NULL_TREE)
4037     off = size_zero_node;
4038
4039   /* If base is not loop invariant, either off is 0, then we start with just
4040      the constant offset in the loop invariant BASE and continue with base
4041      as OFF, otherwise give up.
4042      We could handle that case by gimplifying the addition of base + off
4043      into some SSA_NAME and use that as off, but for now punt.  */
4044   if (!expr_invariant_in_loop_p (loop, base))
4045     {
4046       if (!integer_zerop (off))
4047         return false;
4048       off = base;
4049       base = size_int (pbytepos);
4050     }
4051   /* Otherwise put base + constant offset into the loop invariant BASE
4052      and continue with OFF.  */
4053   else
4054     {
4055       base = fold_convert (sizetype, base);
4056       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4057     }
4058
4059   /* OFF at this point may be either a SSA_NAME or some tree expression
4060      from get_inner_reference.  Try to peel off loop invariants from it
4061      into BASE as long as possible.  */
4062   STRIP_NOPS (off);
       /* Each iteration looks through one operation that defines OFF.  The
          loop ends when a widening conversion has fixed OFFTYPE or when
          nothing more can be peeled (the BREAK after the switch).  */
4063   while (offtype == NULL_TREE)
4064     {
4065       enum tree_code code;
4066       tree op0, op1, add = NULL_TREE;
4067
4068       if (TREE_CODE (off) == SSA_NAME)
4069         {
4070           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4071
               /* OFF has to vary in the loop; see also the final check
                  after this loop.  */
4072           if (expr_invariant_in_loop_p (loop, off))
4073             return false;
4074
               /* Only assignments can be looked through; otherwise keep OFF
                  as it is.  */
4075           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4076             break;
4077
4078           op0 = gimple_assign_rhs1 (def_stmt);
4079           code = gimple_assign_rhs_code (def_stmt);
4080           op1 = gimple_assign_rhs2 (def_stmt);
4081         }
4082       else
4083         {
4084           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4085             return false;
4086           code = TREE_CODE (off);
4087           extract_ops_from_tree (off, &code, &op0, &op1);
4088         }
4089       switch (code)
4090         {
4091         case POINTER_PLUS_EXPR:
4092         case PLUS_EXPR:
4093           if (expr_invariant_in_loop_p (loop, op0))
4094             {
4095               add = op0;
4096               off = op1;
                   /* Shared tail for the addition and subtraction cases:
                      add ADD, multiplied by the scale found so far, to the
                      invariant BASE and go on to look at the new OFF.  */
4097             do_add:
4098               add = fold_convert (sizetype, add);
4099               if (scale != 1)
4100                 add = size_binop (MULT_EXPR, add, size_int (scale));
4101               base = size_binop (PLUS_EXPR, base, add);
4102               continue;
4103             }
4104           if (expr_invariant_in_loop_p (loop, op1))
4105             {
4106               add = op1;
4107               off = op0;
4108               goto do_add;
4109             }
4110           break;
4111         case MINUS_EXPR:
               /* Handle OP0 - OP1 with invariant OP1 as OP0 + (0 - OP1).  */
4112           if (expr_invariant_in_loop_p (loop, op1))
4113             {
4114               add = fold_convert (sizetype, op1);
4115               add = size_binop (MINUS_EXPR, size_zero_node, add);
4116               off = op0;
4117               goto do_add;
4118             }
4119           break;
4120         case MULT_EXPR:
               /* A constant multiplier is only accepted while the recorded
                  scale is still 1.  */
4121           if (scale == 1 && tree_fits_shwi_p (op1))
4122             {
4123               int new_scale = tree_to_shwi (op1);
4124               /* Only treat this as a scaling operation if the target
4125                  supports it for at least some offset type.  */
4126               if (use_ifn_p
4127                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4128                                                 masked_p, vectype, memory_type,
4129                                                 signed_char_type_node,
4130                                                 new_scale, &ifn,
4131                                                 &offset_vectype)
4132                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4133                                                 masked_p, vectype, memory_type,
4134                                                 unsigned_char_type_node,
4135                                                 new_scale, &ifn,
4136                                                 &offset_vectype))
4137                 break;
4138               scale = new_scale;
4139               off = op0;
4140               continue;
4141             }
4142           break;
4143         case SSA_NAME:
               /* The right-hand side is itself an SSA_NAME; continue with
                  that name.  */
4144           off = op0;
4145           continue;
4146         CASE_CONVERT:
4147           if (!POINTER_TYPE_P (TREE_TYPE (op0))
4148               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4149             break;
4150
4151           /* Don't include the conversion if the target is happy with
4152              the current offset type.  */
4153           if (use_ifn_p
4154               && TREE_CODE (off) == SSA_NAME
4155               && !POINTER_TYPE_P (TREE_TYPE (off))
4156               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4157                                            masked_p, vectype, memory_type,
4158                                            TREE_TYPE (off), scale, &ifn,
4159                                            &offset_vectype))
4160             break;
4161
               /* A conversion that keeps the precision is looked through
                  without fixing OFFTYPE.  */
4162           if (TYPE_PRECISION (TREE_TYPE (op0))
4163               == TYPE_PRECISION (TREE_TYPE (off)))
4164             {
4165               off = op0;
4166               continue;
4167             }
4168
4169           /* Include the conversion if it is widening and we're using
4170              the IFN path or the target can handle the converted from
4171              offset or the current size is not already the same as the
4172              data vector element size.  */
4173           if ((TYPE_PRECISION (TREE_TYPE (op0))
4174                < TYPE_PRECISION (TREE_TYPE (off)))
4175               && (use_ifn_p
4176                   || (DR_IS_READ (dr)
4177                       ? (targetm.vectorize.builtin_gather
4178                          && targetm.vectorize.builtin_gather (vectype,
4179                                                               TREE_TYPE (op0),
4180                                                               scale))
4181                       : (targetm.vectorize.builtin_scatter
4182                          && targetm.vectorize.builtin_scatter (vectype,
4183                                                                TREE_TYPE (op0),
4184                                                                scale)))
4185                   || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4186                                        TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4187             {
                   /* Continue with the narrower source of the conversion and
                      record its type as OFFTYPE, which also ends the loop at
                      the next test of the loop condition.  */
4188               off = op0;
4189               offtype = TREE_TYPE (off);
4190               STRIP_NOPS (off);
4191               continue;
4192             }
4193           break;
4194         default:
4195           break;
4196         }
           /* Reached by every BREAK out of the switch: stop peeling.  */
4197       break;
4198     }
4199
4200   /* If at the end OFF still isn't a SSA_NAME or isn't
4201      defined in the loop, punt.  */
4202   if (TREE_CODE (off) != SSA_NAME
4203       || expr_invariant_in_loop_p (loop, off))
4204     return false;
4205
4206   if (offtype == NULL_TREE)
4207     offtype = TREE_TYPE (off);
4208
4209   if (use_ifn_p)
4210     {
           /* A failed lookup is recorded as IFN_LAST rather than treated as
              a reason to return false.  */
4211       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4212                                      vectype, memory_type, offtype, scale,
4213                                      &ifn, &offset_vectype))
4214         ifn = IFN_LAST;
4215       decl = NULL_TREE;
4216     }
4217   else
4218     {
           /* Ask the target for a built-in function, if it provides the
              corresponding hook; DECL stays NULL_TREE otherwise.  */
4219       if (DR_IS_READ (dr))
4220         {
4221           if (targetm.vectorize.builtin_gather)
4222             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4223         }
4224       else
4225         {
4226           if (targetm.vectorize.builtin_scatter)
4227             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4228         }
4229       ifn = IFN_LAST;
4230       /* The offset vector type will be read from DECL when needed.  */
4231       offset_vectype = NULL_TREE;
4232     }
4233
4234   info->ifn = ifn;
4235   info->decl = decl;
4236   info->base = base;
4237   info->offset = off;
4238   info->offset_dt = vect_unknown_def_type;
4239   info->offset_vectype = offset_vectype;
4240   info->scale = scale;
4241   info->element_type = TREE_TYPE (vectype);
4242   info->memory_type = memory_type;
4243   return true;
4244 }
4245
4246 /* Find the data references in STMT, analyze them with respect to LOOP and
4247    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4248    be handled.

        If DATAREF_GROUPS is nonnull, GROUP_ID is pushed onto it each time a
        data reference is pushed onto DATAREFS.  Clobbers and statements
        without data references succeed without appending anything.

        The statement is rejected if it has volatile operands, can throw
        internally, contains more than one data reference, is a call other
        than IFN_MASK_LOAD/IFN_MASK_STORE, accesses a bit-field, or has a
        constant base address.  Data references that are not handed over to
        DATAREFS are freed here.

        When LOOP has a simduid and the initial analysis left some component
        of the data reference unset, the reference is re-analyzed and, if it
        turns out to be indexed by an IFN_GOMP_SIMD_LANE call for that
        simduid, replaced by a data reference marked as a SIMD lane
        access.  */
4249
4250 opt_result
4251 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4252                                vec<data_reference_p> *datarefs,
4253                                vec<int> *dataref_groups, int group_id)
4254 {
4255   /* We can ignore clobbers for dataref analysis - they are removed during
4256      loop vectorization and BB vectorization checks dependences with a
4257      stmt walk.  */
4258   if (gimple_clobber_p (stmt))
4259     return opt_result::success ();
4260
4261   if (gimple_has_volatile_ops (stmt))
4262     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4263                                    stmt);
4264
4265   if (stmt_can_throw_internal (cfun, stmt))
4266     return opt_result::failure_at (stmt,
4267                                    "not vectorized:"
4268                                    " statement can throw an exception: %G",
4269                                    stmt);
4270
4271   auto_vec<data_reference_p, 2> refs;
4272   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4273   if (!res)
4274     return res;
4275
       /* Nothing to record for a statement without data references.  */
4276   if (refs.is_empty ())
4277     return opt_result::success ();
4278
4279   if (refs.length () > 1)
4280     {
           /* Free every data reference found before reporting failure.  */
4281       while (!refs.is_empty ())
4282         free_data_ref (refs.pop ());
4283       return opt_result::failure_at (stmt,
4284                                      "not vectorized: more than one "
4285                                      "data ref in stmt: %G", stmt);
4286     }
4287
       /* From here on DR is the single data reference of STMT and must be
          either freed or pushed onto DATAREFS on every path.  */
4288   data_reference_p dr = refs.pop ();
       /* The only calls accepted here are the internal functions
          IFN_MASK_LOAD and IFN_MASK_STORE.  */
4289   if (gcall *call = dyn_cast <gcall *> (stmt))
4290     if (!gimple_call_internal_p (call)
4291         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4292             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4293       {
4294         free_data_ref (dr);
4295         return opt_result::failure_at (stmt,
4296                                        "not vectorized: dr in a call %G", stmt);
4297       }
4298
4299   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4300       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4301     {
4302       free_data_ref (dr);
4303       return opt_result::failure_at (stmt,
4304                                      "not vectorized:"
4305                                      " statement is bitfield access %G", stmt);
4306     }
4307
4308   if (DR_BASE_ADDRESS (dr)
4309       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4310     {
4311       free_data_ref (dr);
4312       return opt_result::failure_at (stmt,
4313                                      "not vectorized:"
4314                                      " base addr of dr is a constant\n");
4315     }
4316
4317   /* Check whether this may be a SIMD lane access and adjust the
4318      DR to make it easier for us to handle it.  */
4319   if (loop
4320       && loop->simduid
4321       && (!DR_BASE_ADDRESS (dr)
4322           || !DR_OFFSET (dr)
4323           || !DR_INIT (dr)
4324           || !DR_STEP (dr)))
4325     {
           /* Analyze the reference again, this time in the loop that
              contains STMT.
              NOTE(review): the NULL first argument presumably restricts the
              analysis to that loop alone -- confirm against
              create_data_ref.  */
4326       struct data_reference *newdr
4327         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4328                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
           /* Only continue if the new analysis is complete, with a constant
              DR_INIT and a zero step.  */
4329       if (DR_BASE_ADDRESS (newdr)
4330           && DR_OFFSET (newdr)
4331           && DR_INIT (newdr)
4332           && DR_STEP (newdr)
4333           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4334           && integer_zerop (DR_STEP (newdr)))
4335         {
4336           tree base_address = DR_BASE_ADDRESS (newdr);
4337           tree off = DR_OFFSET (newdr);
               /* STEP is 1 unless OFF turns out to be a multiplication by a
                  constant below.  */
4338           tree step = ssize_int (1);
               /* If the analyzed offset is zero but the base address is
                  itself a POINTER_PLUS_EXPR, split it into base and
                  offset.  */
4339           if (integer_zerop (off)
4340               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4341             {
4342               off = TREE_OPERAND (base_address, 1);
4343               base_address = TREE_OPERAND (base_address, 0);
4344             }
4345           STRIP_NOPS (off);
4346           if (TREE_CODE (off) == MULT_EXPR
4347               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4348             {
4349               step = TREE_OPERAND (off, 1);
4350               off = TREE_OPERAND (off, 0);
4351               STRIP_NOPS (off);
4352             }
               /* Look through a widening conversion in the expression.  */
4353           if (CONVERT_EXPR_P (off)
4354               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4355                   < TYPE_PRECISION (TREE_TYPE (off))))
4356             off = TREE_OPERAND (off, 0);
4357           if (TREE_CODE (off) == SSA_NAME)
4358             {
4359               gimple *def = SSA_NAME_DEF_STMT (off);
4360               /* Look through widening conversion.  */
4361               if (is_gimple_assign (def)
4362                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4363                 {
4364                   tree rhs1 = gimple_assign_rhs1 (def);
4365                   if (TREE_CODE (rhs1) == SSA_NAME
4366                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4367                       && (TYPE_PRECISION (TREE_TYPE (off))
4368                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4369                     def = SSA_NAME_DEF_STMT (rhs1);
4370                 }
                   /* The index has to come from an IFN_GOMP_SIMD_LANE call
                      whose first argument is based on the simduid of
                      LOOP.  */
4371               if (is_gimple_call (def)
4372                   && gimple_call_internal_p (def)
4373                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4374                 {
4375                   tree arg = gimple_call_arg (def, 0);
4376                   tree reft = TREE_TYPE (DR_REF (newdr));
4377                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4378                   arg = SSA_NAME_VAR (arg);
                       /* Besides matching the simduid, the step currently
                          has to equal the size in bytes of the referenced
                          type.  */
4379                   if (arg == loop->simduid
4380                       /* For now.  */
4381                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4382                     {
                           /* Rewrite NEWDR so that it has a zero offset and
                              a step of STEP.  */
4383                       DR_BASE_ADDRESS (newdr) = base_address;
4384                       DR_OFFSET (newdr) = ssize_int (0);
4385                       DR_STEP (newdr) = step;
4386                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4387                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4388                       /* Mark as simd-lane access.  */
                           /* AUX holds -1 minus the second argument of the
                              IFN_GOMP_SIMD_LANE call; vect_analyze_data_refs
                              recognizes the values -1 to -4.  */
4389                       tree arg2 = gimple_call_arg (def, 1);
4390                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
                           /* NEWDR replaces DR, which is no longer needed.  */
4391                       free_data_ref (dr);
4392                       datarefs->safe_push (newdr);
4393                       if (dataref_groups)
4394                         dataref_groups->safe_push (group_id);
4395                       return opt_result::success ();
4396                     }
4397                 }
4398             }
4399         }
           /* Not recognized as a SIMD lane access: discard the re-analysis
              and fall back to recording the original DR.  */
4400       free_data_ref (newdr);
4401     }
4402
4403   datarefs->safe_push (dr);
4404   if (dataref_groups)
4405     dataref_groups->safe_push (group_id);
4406   return opt_result::success ();
4407 }
4408
4409 /* Function vect_analyze_data_refs.
4410
4411   Find all the data references in the loop or basic block.
4412
4413    The general structure of the analysis of data refs in the vectorizer is as
4414    follows:
4415    1- vect_analyze_data_refs(loop/bb): call
4416       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4417       in the loop/bb and their dependences.
4418    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4419    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4420    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4421
4422 */
4423
4424 opt_result
4425 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4426 {
4427   class loop *loop = NULL;
4428   unsigned int i;
4429   struct data_reference *dr;
4430   tree scalar_type;
4431
4432   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4433
4434   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4435     loop = LOOP_VINFO_LOOP (loop_vinfo);
4436
4437   /* Go through the data-refs, check that the analysis succeeded.  Update
4438      pointer from stmt_vec_info struct to DR and vectype.  */
4439
4440   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4441   FOR_EACH_VEC_ELT (datarefs, i, dr)
4442     {
4443       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4444       poly_uint64 vf;
4445
4446       gcc_assert (DR_REF (dr));
4447       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4448       gcc_assert (!stmt_info->dr_aux.dr);
4449       stmt_info->dr_aux.dr = dr;
4450       stmt_info->dr_aux.stmt = stmt_info;
4451
4452       /* Check that analysis of the data-ref succeeded.  */
4453       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4454           || !DR_STEP (dr))
4455         {
4456           bool maybe_gather
4457             = DR_IS_READ (dr)
4458               && !TREE_THIS_VOLATILE (DR_REF (dr));
4459           bool maybe_scatter
4460             = DR_IS_WRITE (dr)
4461               && !TREE_THIS_VOLATILE (DR_REF (dr))
4462               && (targetm.vectorize.builtin_scatter != NULL
4463                   || supports_vec_scatter_store_p ());
4464
4465           /* If target supports vector gather loads or scatter stores,
4466              see if they can't be used.  */
4467           if (is_a <loop_vec_info> (vinfo)
4468               && !nested_in_vect_loop_p (loop, stmt_info))
4469             {
4470               if (maybe_gather || maybe_scatter)
4471                 {
4472                   if (maybe_gather)
4473                     gatherscatter = GATHER;
4474                   else
4475                     gatherscatter = SCATTER;
4476                 }
4477             }
4478
4479           if (gatherscatter == SG_NONE)
4480             {
4481               if (dump_enabled_p ())
4482                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4483                                  "not vectorized: data ref analysis "
4484                                  "failed %G", stmt_info->stmt);
4485               if (is_a <bb_vec_info> (vinfo))
4486                 {
4487                   /* In BB vectorization the ref can still participate
4488                      in dependence analysis, we just can't vectorize it.  */
4489                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4490                   continue;
4491                 }
4492               return opt_result::failure_at (stmt_info->stmt,
4493                                              "not vectorized:"
4494                                              " data ref analysis failed: %G",
4495                                              stmt_info->stmt);
4496             }
4497         }
4498
4499       /* See if this was detected as SIMD lane access.  */
4500       if (dr->aux == (void *)-1
4501           || dr->aux == (void *)-2
4502           || dr->aux == (void *)-3
4503           || dr->aux == (void *)-4)
4504         {
4505           if (nested_in_vect_loop_p (loop, stmt_info))
4506             return opt_result::failure_at (stmt_info->stmt,
4507                                            "not vectorized:"
4508                                            " data ref analysis failed: %G",
4509                                            stmt_info->stmt);
4510           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4511             = -(uintptr_t) dr->aux;
4512         }
4513
4514       tree base = get_base_address (DR_REF (dr));
4515       if (base && VAR_P (base) && DECL_NONALIASED (base))
4516         {
4517           if (dump_enabled_p ())
4518             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4519                              "not vectorized: base object not addressable "
4520                              "for stmt: %G", stmt_info->stmt);
4521           if (is_a <bb_vec_info> (vinfo))
4522             {
4523               /* In BB vectorization the ref can still participate
4524                  in dependence analysis, we just can't vectorize it.  */
4525               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4526               continue;
4527             }
4528           return opt_result::failure_at (stmt_info->stmt,
4529                                          "not vectorized: base object not"
4530                                          " addressable for stmt: %G",
4531                                          stmt_info->stmt);
4532         }
4533
4534       if (is_a <loop_vec_info> (vinfo)
4535           && DR_STEP (dr)
4536           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4537         {
4538           if (nested_in_vect_loop_p (loop, stmt_info))
4539             return opt_result::failure_at (stmt_info->stmt,
4540                                            "not vectorized: "
4541                                            "not suitable for strided load %G",
4542                                            stmt_info->stmt);
4543           STMT_VINFO_STRIDED_P (stmt_info) = true;
4544         }
4545
4546       /* Update DR field in stmt_vec_info struct.  */
4547
4548       /* If the dataref is in an inner-loop of the loop that is considered for
4549          for vectorization, we also want to analyze the access relative to
4550          the outer-loop (DR contains information only relative to the
4551          inner-most enclosing loop).  We do that by building a reference to the
4552          first location accessed by the inner-loop, and analyze it relative to
4553          the outer-loop.  */
4554       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4555         {
4556           /* Build a reference to the first location accessed by the
4557              inner loop: *(BASE + INIT + OFFSET).  By construction,
4558              this address must be invariant in the inner loop, so we
4559              can consider it as being used in the outer loop.  */
4560           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4561           tree offset = unshare_expr (DR_OFFSET (dr));
4562           tree init = unshare_expr (DR_INIT (dr));
4563           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4564                                           init, offset);
4565           tree init_addr = fold_build_pointer_plus (base, init_offset);
4566           tree init_ref = build_fold_indirect_ref (init_addr);
4567
4568           if (dump_enabled_p ())
4569             dump_printf_loc (MSG_NOTE, vect_location,
4570                              "analyze in outer loop: %T\n", init_ref);
4571
4572           opt_result res
4573             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4574                                     init_ref, loop, stmt_info->stmt);
4575           if (!res)
4576             /* dr_analyze_innermost already explained the failure.  */
4577             return res;
4578
4579           if (dump_enabled_p ())
4580             dump_printf_loc (MSG_NOTE, vect_location,
4581                              "\touter base_address: %T\n"
4582                              "\touter offset from base address: %T\n"
4583                              "\touter constant offset from base address: %T\n"
4584                              "\touter step: %T\n"
4585                              "\touter base alignment: %d\n\n"
4586                              "\touter base misalignment: %d\n"
4587                              "\touter offset alignment: %d\n"
4588                              "\touter step alignment: %d\n",
4589                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4590                              STMT_VINFO_DR_OFFSET (stmt_info),
4591                              STMT_VINFO_DR_INIT (stmt_info),
4592                              STMT_VINFO_DR_STEP (stmt_info),
4593                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4594                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4595                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4596                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4597         }
4598
4599       /* Set vectype for STMT.  */
4600       scalar_type = TREE_TYPE (DR_REF (dr));
4601       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4602       if (!vectype)
4603         {
4604           if (dump_enabled_p ())
4605             {
4606               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4607                                "not vectorized: no vectype for stmt: %G",
4608                                stmt_info->stmt);
4609               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4610               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4611                                  scalar_type);
4612               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4613             }
4614
4615           if (is_a <bb_vec_info> (vinfo))
4616             {
4617               /* No vector type is fine, the ref can still participate
4618                  in dependence analysis, we just can't vectorize it.  */
4619               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4620               continue;
4621             }
4622           if (fatal)
4623             *fatal = false;
4624           return opt_result::failure_at (stmt_info->stmt,
4625                                          "not vectorized:"
4626                                          " no vectype for stmt: %G"
4627                                          " scalar_type: %T\n",
4628                                          stmt_info->stmt, scalar_type);
4629         }
4630       else
4631         {
4632           if (dump_enabled_p ())
4633             dump_printf_loc (MSG_NOTE, vect_location,
4634                              "got vectype for stmt: %G%T\n",
4635                              stmt_info->stmt, vectype);
4636         }
4637
4638       /* Adjust the minimal vectorization factor according to the
4639          vector type.  */
4640       vf = TYPE_VECTOR_SUBPARTS (vectype);
4641       *min_vf = upper_bound (*min_vf, vf);
4642
4643       /* Leave the BB vectorizer to pick the vector type later, based on
4644          the final dataref group size and SLP node size.  */
4645       if (is_a <loop_vec_info> (vinfo))
4646         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4647
4648       if (gatherscatter != SG_NONE)
4649         {
4650           gather_scatter_info gs_info;
4651           if (!vect_check_gather_scatter (stmt_info,
4652                                           as_a <loop_vec_info> (vinfo),
4653                                           &gs_info)
4654               || !get_vectype_for_scalar_type (vinfo,
4655                                                TREE_TYPE (gs_info.offset)))
4656             {
4657               if (fatal)
4658                 *fatal = false;
4659               return opt_result::failure_at
4660                         (stmt_info->stmt,
4661                          (gatherscatter == GATHER)
4662                          ? "not vectorized: not suitable for gather load %G"
4663                          : "not vectorized: not suitable for scatter store %G",
4664                          stmt_info->stmt);
4665             }
4666           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4667         }
4668     }
4669
4670   /* We used to stop processing and prune the list here.  Verify we no
4671      longer need to.  */
4672   gcc_assert (i == datarefs.length ());
4673
4674   return opt_result::success ();
4675 }
4676
4677
4678 /* Function vect_get_new_vect_var.
4679
4680    Create a temporary of type TYPE through create_tmp_reg and return it.
4681    VAR_KIND selects the name prefix ("vect", "stmp", "mask" or "vectp");
4682    when NAME is non-NULL "<prefix>_<NAME>" is passed on as the name,
4683    otherwise just "<prefix>".  Any other VAR_KIND is unreachable.  */
4684
4685 tree
4686 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4687 {
4688   const char *kind_prefix;
4689
4690   switch (var_kind)
4691     {
4692     case vect_simple_var:
4693       kind_prefix = "vect";
4694       break;
4695     case vect_scalar_var:
4696       kind_prefix = "stmp";
4697       break;
4698     case vect_mask_var:
4699       kind_prefix = "mask";
4700       break;
4701     case vect_pointer_var:
4702       kind_prefix = "vectp";
4703       break;
4704     default:
4705       gcc_unreachable ();
4706     }
4707
4708   /* Without a NAME the bare prefix is handed to create_tmp_reg.  */
4709   if (!name)
4710     return create_tmp_reg (type, kind_prefix);
4711
4712   /* Build "<prefix>_<NAME>"; the concatenated buffer is only needed for
4713      the create_tmp_reg call and is released right after it.  */
4714   char *full_name = concat (kind_prefix, "_", name, NULL);
4715   tree result = create_tmp_reg (type, full_name);
4716   free (full_name);
4717
4718   return result;
4719 }
4720
4721 /* Like vect_get_new_vect_var, but the result comes from make_temp_ssa_name,
4722    i.e. it is an SSA name.  Only vect_simple_var, vect_scalar_var and
4723    vect_pointer_var are accepted here; anything else is unreachable.  */
4724
4725 tree
4726 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4727 {
4728   const char *kind_prefix;
4729
4730   switch (var_kind)
4731     {
4732     case vect_simple_var:
4733       kind_prefix = "vect";
4734       break;
4735     case vect_scalar_var:
4736       kind_prefix = "stmp";
4737       break;
4738     case vect_pointer_var:
4739       kind_prefix = "vectp";
4740       break;
4741     default:
4742       gcc_unreachable ();
4743     }
4744
4745   if (!name)
4746     return make_temp_ssa_name (type, NULL, kind_prefix);
4747
4748   /* The concatenated buffer is only needed for the call below.  */
4749   char *full_name = concat (kind_prefix, "_", name, NULL);
4750   tree result = make_temp_ssa_name (type, NULL, full_name);
4751   free (full_name);
4752
4753   return result;
4754 }
4755
4756 /* Duplicate the points-to info of DR_INFO's data reference onto NAME.  */
4757
4758 static void
4759 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4760 {
4761   struct data_reference *src_dr = dr_info->dr;
4762   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (src_dr));
4763   /* That info is for the base SSA name only, so drop its alignment.  */
4764   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4765 }
4766
4767 /* Function vect_create_addr_base_for_vector_ref.
4768
4769    Build and gimplify an expression for the address of the first memory
4770    location accessed by the data reference of STMT_INFO.
4771
4772    Input:
4773    VINFO: The vectorization context.  Whether it is a loop_vec_info decides
4774           how the address is formed, see below.
4775    STMT_INFO: The statement containing the data reference.
4776    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4777    OFFSET: Optional.  If supplied, it is converted to sizetype and added to
4778            the initial address.
4779
4780    When VINFO is a loop_vec_info the address is the base address from
4781    vect_dr_behavior plus the sum of the offset from get_dr_vinfo_offset,
4782    the init from vect_dr_behavior and OFFSET.  Otherwise the address of
4783    DR_REF itself is taken and those offsets do not contribute.
4784
4785    The first location depends on the loop the access is described relative
4786    to.  For the following dataref ('in' points to short):
4787
4788        for (i=0; i<N; i++)
4789           for (j=0; j<M; j++)
4790             s += in[i+j]
4791
4792    it is:
4793    relative to i_loop:     &in
4794    relative to j_loop:     &in+i*2B
4795
4796    Output:
4797    1. Return the operand produced by force_gimple_operand (typically an
4798       SSA_NAME) whose value is the address of the first vector accessed.
4799    2. If new_stmt_list is not NULL_TREE after return then the caller must
4800       insert these statement(s) which define the returned value.
4801
4802    FORNOW: We are only handling array accesses with step 1.  */
4803
4804 tree
4805 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4806                                       gimple_seq *new_stmt_list,
4807                                       tree offset)
4808 {
4809   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4810   struct data_reference *dr = dr_info->dr;
4811   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4812   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4813
4814   tree data_ref_base = unshare_expr (drb->base_address);
4815   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4816   tree init = unshare_expr (drb->init);
4817
4818   const char *base_name;
4819   if (!loop_vinfo)
4820     {
4821       /* The address of DR_REF is taken below; no extra offset applies.  */
4822       base_offset = ssize_int (0);
4823       init = ssize_int (0);
4824       base_name = get_name (DR_REF (dr));
4825     }
4826   else
4827     base_name = get_name (data_ref_base);
4828
4829   /* Accumulate offset + init (+ OFFSET) in sizetype.  */
4830   base_offset = size_binop (PLUS_EXPR,
4831                             fold_convert (sizetype, base_offset),
4832                             fold_convert (sizetype, init));
4833   if (offset)
4834     base_offset = fold_build2 (PLUS_EXPR, sizetype, base_offset,
4835                                fold_convert (sizetype, offset));
4836
4837   /* base + base_offset */
4838   tree addr_base;
4839   if (loop_vinfo)
4840     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4841   else
4842     addr_base = build1 (ADDR_EXPR,
4843                         build_pointer_type (TREE_TYPE (DR_REF (dr))),
4844                         unshare_expr (DR_REF (dr)));
4845
4846   tree vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4847   tree dest
4848     = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4849   gimple_seq seq = NULL;
4850   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4851   gimple_seq_add_seq (new_stmt_list, seq);
4852
4853   /* We should only duplicate pointer info to newly created SSA names.  */
4854   if (TREE_CODE (addr_base) == SSA_NAME
4855       && SSA_NAME_VAR (addr_base) == dest
4856       && DR_PTR_INFO (dr))
4857     {
4858       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4859       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4860     }
4861
4862   if (dump_enabled_p ())
4863     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4864
4865   return addr_base;
4866 }
4867
4868
4869 /* Function vect_create_data_ref_ptr.
4870
4871    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4872    location accessed in the loop by STMT_INFO, along with the def-use update
4873    chain to appropriately advance the pointer through the loop iterations.
4874    Also set aliasing information for the pointer.  This pointer is used by
4875    the callers to this function to create a memory reference expression for
4876    vector load/store access.
4877
4878    Input:
4879    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4880          GIMPLE_ASSIGN <name, data-ref> or
4881          GIMPLE_ASSIGN <data-ref, name>.
4882    2. AGGR_TYPE: the type of the reference, which should be either a vector
4883         or an array (asserted unless IV_STEP is supplied).
4884    3. AT_LOOP: the loop where the vector memref is to be created.
4885    4. OFFSET (optional): a byte offset to be added to the initial address
4886         accessed by the data-ref in STMT_INFO.
4887    5. GSI: location where the new stmts are to be placed if there is no loop
4888    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4889         pointing to the initial address.
4890    7. IV_STEP (optional, defaults to NULL): the amount that should be added
4891         to the IV during each iteration of the loop.  NULL says to move
4892         by one copy of AGGR_TYPE up or down, depending on the step of the
4893         data reference.
4894
4895    Output:
4896    1. Declare a new ptr to vector_type, and have it point to the base of the
4897       data reference (initial address accessed by the data reference).
4898       For example, for vector of type V8HI, the following code is generated:
4899
4900       v8hi *ap;
4901       ap = (v8hi *)initial_address;
4902
4903       if OFFSET is not supplied:
4904          initial_address = &a[init];
4905       if OFFSET (a byte offset) is supplied:
4906          initial_address = &a[init] + OFFSET;
4907
4908       Return the initial_address in INITIAL_ADDRESS.
4909
4910    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4911       update the pointer in each iteration of the loop.
4912
4913       Return the increment stmt that updates the pointer in PTR_INCR, if
4914       PTR_INCR is non-NULL.  When VINFO is not a loop_vec_info, ONLY_INIT
4915       is treated as true and *PTR_INCR is set to NULL instead.
4916
4917    3. Return the pointer.  */
4918
4919 tree
4920 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4921                           tree aggr_type, class loop *at_loop, tree offset,
4922                           tree *initial_address, gimple_stmt_iterator *gsi,
4923                           gimple **ptr_incr, bool only_init,
4924                           tree iv_step)
4925 {
4926   const char *base_name;
4927   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4928   class loop *loop = NULL;
4929   bool nested_in_vect_loop = false;
4930   class loop *containing_loop = NULL;
4931   tree aggr_ptr_type;
4932   tree aggr_ptr;
4933   tree new_temp;
4934   gimple_seq new_stmt_list = NULL;
4935   edge pe = NULL;
4936   basic_block new_bb;
4937   tree aggr_ptr_init;
4938   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4939   struct data_reference *dr = dr_info->dr;
4940   tree aptr;
4941   gimple_stmt_iterator incr_gsi;
4942   bool insert_after;
4943   tree indx_before_incr, indx_after_incr;
4944   gimple *incr;
4945   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4946
4947   gcc_assert (iv_step != NULL_TREE
4948               || TREE_CODE (aggr_type) == ARRAY_TYPE
4949               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4950
4951   if (loop_vinfo)
4952     {
4953       loop = LOOP_VINFO_LOOP (loop_vinfo);
4954       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4955       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4956       pe = loop_preheader_edge (loop);
4957     }
4958   else
4959     {
4960       gcc_assert (bb_vinfo);
4961       only_init = true;
4962       if (ptr_incr)
4963         *ptr_incr = NULL;
4964     }
4965
4966   /* The name of the base address is used to name the new pointer.  */
4967   base_name = get_name (DR_BASE_ADDRESS (dr));
4968
4969   if (dump_enabled_p ())
4970     {
4971       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4972       dump_printf_loc (MSG_NOTE, vect_location,
4973                        "create %s-pointer variable to type: %T",
4974                        get_tree_code_name (TREE_CODE (aggr_type)),
4975                        aggr_type);
4976       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4977         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4978       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4979         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4980       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4981         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4982       else
4983         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4984       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4985     }
4986
4987   /* (1) Create the new aggregate-pointer variable.
4988      Vector and array types inherit the alias set of their component
4989      type by default so we need to use a ref-all pointer if the data
4990      reference does not conflict with the created aggregated data
4991      reference because it is not addressable.  */
4992   bool need_ref_all = false;
4993   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4994                               get_alias_set (DR_REF (dr))))
4995     need_ref_all = true;
4996   /* Likewise for any of the data references in the stmt group.  */
4997   else if (DR_GROUP_SIZE (stmt_info) > 1)
4998     {
4999       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5000       do
5001         {
5002           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5003           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5004                                       get_alias_set (DR_REF (sdr))))
5005             {
5006               need_ref_all = true;
5007               break;
5008             }
5009           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5010         }
5011       while (sinfo);
5012     }
5013   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5014                                                need_ref_all);
5015   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5016
5017
5018   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5019      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5020      def-use update cycles for the pointer: one relative to the outer-loop
5021      (LOOP), which is what steps (2) and (3) below do.  The other is relative
5022      to the inner-loop (which is the inner-most loop containing the dataref),
5023      and this is done by step (4) below.
5024
5025      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5026      inner-most loop, and so steps (2),(3) work the same, and step (4) is
5027      redundant.  Steps (2),(3) create the following:
5028
5029         vp0 = &base_addr;
5030         LOOP:   vp1 = phi(vp0,vp2)
5031                 ...
5032                 ...
5033                 vp2 = vp1 + step
5034                 goto LOOP
5035
5036      If there is an inner-loop nested in loop, then step (4) will also be
5037      applied, and an additional update in the inner-loop will be created:
5038
5039         vp0 = &base_addr;
5040         LOOP:   vp1 = phi(vp0,vp2)
5041                 ...
5042         inner:     vp3 = phi(vp1,vp4)
5043                    vp4 = vp3 + inner_step
5044                    if () goto inner
5045                 ...
5046                 vp2 = vp1 + step
5047                 if () goto LOOP   */
5048
5049   /* (2) Calculate the initial address of the aggregate-pointer, and set
5050      the aggregate-pointer to point to it before the loop.  */
5051
5052   /* Create: &base[init_val] + offset, inserted on the loop preheader edge
5053      if there is one and before GSI otherwise.  */
5054   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5055                                                    stmt_info, &new_stmt_list,
5056                                                    offset);
5057   if (new_stmt_list)
5058     {
5059       if (pe)
5060         {
5061           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5062           gcc_assert (!new_bb);
5063         }
5064       else
5065         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5066     }
5067
5068   *initial_address = new_temp;
5069   aggr_ptr_init = new_temp;
5070
5071   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5072      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5073      inner-loop nested in LOOP (during outer-loop vectorization).  */
5074
5075   /* No update in loop is required.  */
5076   if (only_init && (!loop_vinfo || at_loop == loop))
5077     aptr = aggr_ptr_init;
5078   else
5079     {
5080       /* Accesses to invariant addresses should be handled specially
5081          by the caller.  */
5082       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5083       gcc_assert (!integer_zerop (step));
5084
5085       if (iv_step == NULL_TREE)
5086         {
5087           /* The step of the aggregate pointer is the type size,
5088              negated for downward accesses.  */
5089           iv_step = TYPE_SIZE_UNIT (aggr_type);
5090           if (tree_int_cst_sgn (step) == -1)
5091             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5092         }
5093
5094       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5095
5096       create_iv (aggr_ptr_init,
5097                  fold_convert (aggr_ptr_type, iv_step),
5098                  aggr_ptr, loop, &incr_gsi, insert_after,
5099                  &indx_before_incr, &indx_after_incr);
5100       incr = gsi_stmt (incr_gsi);
5101
5102       /* Copy the points-to information if it exists. */
5103       if (DR_PTR_INFO (dr))
5104         {
5105           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5106           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5107         }
5108       if (ptr_incr)
5109         *ptr_incr = incr;
5110
5111       aptr = indx_before_incr;
5112     }
5113
5114   if (!nested_in_vect_loop || only_init)
5115     return aptr;
5116
5117
5118   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5119      nested in LOOP, if exists.  */
5120
5121   gcc_assert (nested_in_vect_loop);
5122   if (!only_init)
5123     {
5124       standard_iv_increment_position (containing_loop, &incr_gsi,
5125                                       &insert_after);
5126       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5127                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5128                  &indx_after_incr);
5129       incr = gsi_stmt (incr_gsi);
5130
5131       /* Copy the points-to information if it exists. */
5132       if (DR_PTR_INFO (dr))
5133         {
5134           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5135           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5136         }
5137       if (ptr_incr)
5138         *ptr_incr = incr;
5139
5140       return indx_before_incr;
5141     }
5142   else
5143     gcc_unreachable ();
5144 }
5145
5146
5147 /* Function bump_vector_ptr
5148
5149    Advance DATAREF_PTR by BUMP, or by the size of the vector type of
5150    STMT_INFO when BUMP is not given.  When PTR_INCR is given, the stmt that
5151    steps the pointer across iterations is rewired to start from the bumped
5152    pointer, as illustrated below.
5153
5154    Def-use update-chain of the pointer on entry:
5155                         DATAREF_PTR = phi (p_0, p_2)
5156                         ....
5157         PTR_INCR:       p_2 = DATAREF_PTR + step
5158
5159    Def-use update-chain of the pointer on return:
5160                         DATAREF_PTR = phi (p_0, p_2)
5161                         ....
5162                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5163                         ....
5164         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5165
5166    Input:
5167    VINFO - the vectorization context, passed on when emitting the new stmt.
5168    DATAREF_PTR - the pointer (to vector type) to advance; an SSA name, or
5169                  possibly an invariant address (see Output).
5170    PTR_INCR - optional.  The stmt that updates the pointer in each iteration
5171               of the loop.  Its operands other than DATAREF_PTR are asserted
5172               to equal the bump amount.
5173    GSI - location where the new update stmt is to be placed.
5174    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5175    BUMP - optional.  The offset by which to bump the pointer.
5176
5177    Output: Return NEW_DATAREF_PTR as illustrated above.  If DATAREF_PTR is a
5178    gimple invariant, no stmt is emitted and the address of a MEM_REF of
5179    DATAREF_PTR at the bump amount is returned instead.  */
5180
5181 tree
5182 bump_vector_ptr (vec_info *vinfo,
5183                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5184                  stmt_vec_info stmt_info, tree bump)
5185 {
5186   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5187   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5188
5189   /* The vector size is the default amount; BUMP overrides it.  */
5190   tree amount = TYPE_SIZE_UNIT (vectype);
5191   if (bump)
5192     amount = bump;
5193
5194   tree bumped_ptr;
5195   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5196     bumped_ptr = copy_ssa_name (dataref_ptr);
5197   else
5198     {
5199       if (is_gimple_min_invariant (dataref_ptr))
5200         {
5201           /* When possible avoid emitting a separate increment stmt that
5202              will force the addressed object addressable.  */
5203           tree pointee_type = TREE_TYPE (TREE_TYPE (dataref_ptr));
5204           tree mem = fold_build2 (MEM_REF, pointee_type, dataref_ptr,
5205                                   fold_convert (ptr_type_node, amount));
5206           return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr), mem);
5207         }
5208       bumped_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5209     }
5210
5211   gimple *incr_stmt = gimple_build_assign (bumped_ptr, POINTER_PLUS_EXPR,
5212                                            dataref_ptr, amount);
5213   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5214
5215   /* Fold the increment right away.  This avoids long use-def chains of
5216      such increments, which cause compile-time issues for passes running
5217      before the next forwprop pass (which would fold them as well).  */
5218   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5219   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5220     update_stmt (gsi_stmt (fold_gsi));
5221
5222   /* Copy the points-to information if it exists.  */
5223   if (DR_PTR_INFO (dr))
5224     {
5225       duplicate_ssa_name_ptr_info (bumped_ptr, DR_PTR_INFO (dr));
5226       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (bumped_ptr));
5227     }
5228
5229   if (ptr_incr)
5230     {
5231       /* Update the vector-pointer's cross-iteration increment.  */
5232       ssa_op_iter iter;
5233       use_operand_p use_p;
5234       FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5235         {
5236           tree use = USE_FROM_PTR (use_p);
5237           if (use != dataref_ptr)
5238             gcc_assert (operand_equal_p (use, amount, 0));
5239           else
5240             SET_USE (use_p, bumped_ptr);
5241         }
5242     }
5243   return bumped_ptr;
5244 }
5245
5246
5247 /* Copy the dependence clique and base from SRC's innermost base to DEST.
5248    Only done when DEST is a MEM_REF and that base is a [TARGET_]MEM_REF.  */
5249
5250 void
5251 vect_copy_ref_info (tree dest, tree src)
5252 {
5253   if (TREE_CODE (dest) != MEM_REF)
5254     return;
5255
5256   /* Peel all handled components off SRC to reach its base.  */
5257   tree inner = src;
5258   for (; handled_component_p (inner); inner = TREE_OPERAND (inner, 0))
5259     ;
5260   if (TREE_CODE (inner) == MEM_REF || TREE_CODE (inner) == TARGET_MEM_REF)
5261     {
5262       MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (inner);
5263       MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (inner);
5264     }
5265 }
5266
5267
5268 /* Function vect_create_destination_var.  Create a new temporary named
5269    after SCALAR_DEST.  Its type is VECTYPE, or the type of SCALAR_DEST
5270    when VECTYPE is NULL.  */
5271
5272 tree
5273 vect_create_destination_var (tree scalar_dest, tree vectype)
5274 {
5275   /* A null VECTYPE requests a scalar variable; boolean vector types
5276      get mask variables and all other vector types plain ones.  */
5277   enum vect_var_kind kind;
5278   if (!vectype)
5279     kind = vect_scalar_var;
5280   else if (VECTOR_BOOLEAN_TYPE_P (vectype))
5281     kind = vect_mask_var;
5282   else
5283     kind = vect_simple_var;
5284   tree type = vectype ? vectype : TREE_TYPE (scalar_dest);
5285
5286   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5287
5288   /* Derive the name from SCALAR_DEST's name, if any, and SSA version.  */
5289   const char *name = get_name (scalar_dest);
5290   unsigned int version = SSA_NAME_VERSION (scalar_dest);
5291   char *new_name = (name
5292                     ? xasprintf ("%s_%u", name, version)
5293                     : xasprintf ("_%u", version));
5294
5295   tree vec_dest = vect_get_new_vect_var (type, kind, new_name);
5296   free (new_name);
5297
5298   return vec_dest;
5299 }
5300
5301 /* Function vect_grouped_store_supported.
5302
5303    Returns TRUE if the permutations needed to interleave a store group of
5304    size COUNT with vectors of type VECTYPE are supported, else FALSE.  */
5305
5306 bool
5307 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5308 {
5309   machine_mode mode = TYPE_MODE (vectype);
5310
5311   /* vect_permute_store_chain requires the group size to be equal to 3 or
5312      be a power of two.  */
5313   if (count != 3 && exact_log2 (count) == -1)
5314     {
5315       if (dump_enabled_p ())
5316         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5317                          "the size of the group of accesses"
5318                          " is not a power of 2 or not equal to 3\n");
5319       return false;
5320     }
5321
5322   /* Check that the permutation is supported.  */
5323   if (VECTOR_MODE_P (mode))
5324     {
5325       unsigned int i;
5326       if (count == 3)
5327         {
5328           unsigned int j0 = 0, j1 = 0, j2 = 0;
5329           unsigned int j;
5330           /* The group-of-3 masks are built for a constant element count.  */
5331           unsigned int nelt;
5332           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5333             {
5334               if (dump_enabled_p ())
5335                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5336                                  "cannot handle groups of 3 stores for"
5337                                  " variable-length vectors\n");
5338               return false;
5339             }
5340
5341           vec_perm_builder sel (nelt, nelt, 1);
5342           sel.quick_grow (nelt);
5343           vec_perm_indices indices;
5344           for (j = 0; j < 3; j++)
5345             {
5346               int nelt0 = ((3 - j) * nelt) % 3;
5347               int nelt1 = ((3 - j) * nelt + 1) % 3;
5348               int nelt2 = ((3 - j) * nelt + 2) % 3;
5349               for (i = 0; i < nelt; i++)
5350                 {
5351                   if (3 * i + nelt0 < nelt)
5352                     sel[3 * i + nelt0] = j0++;
5353                   if (3 * i + nelt1 < nelt)
5354                     sel[3 * i + nelt1] = nelt + j1++;
5355                   if (3 * i + nelt2 < nelt)
5356                     sel[3 * i + nelt2] = 0;
5357                 }
5358               indices.new_vector (sel, 2, nelt);
5359               if (!can_vec_perm_const_p (mode, mode, indices))
5360                 {
5361                   if (dump_enabled_p ())
5362                     dump_printf (MSG_MISSED_OPTIMIZATION,
5363                                  "permutation op not supported by target.\n");
5364                   return false;
5365                 }
5366
5367               for (i = 0; i < nelt; i++)
5368                 {
5369                   if (3 * i + nelt0 < nelt)
5370                     sel[3 * i + nelt0] = 3 * i + nelt0;
5371                   if (3 * i + nelt1 < nelt)
5372                     sel[3 * i + nelt1] = 3 * i + nelt1;
5373                   if (3 * i + nelt2 < nelt)
5374                     sel[3 * i + nelt2] = nelt + j2++;
5375                 }
5376               indices.new_vector (sel, 2, nelt);
5377               if (!can_vec_perm_const_p (mode, mode, indices))
5378                 {
5379                   if (dump_enabled_p ())
5380                     dump_printf (MSG_MISSED_OPTIMIZATION,
5381                                  "permutation op not supported by target.\n");
5382                   return false;
5383                 }
5384             }
5385           return true;
5386         }
5387       else
5388         {
5389           /* If length is not equal to 3 then only power of 2 is supported.  */
5390           gcc_assert (pow2p_hwi (count));
5391           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5392
5393           /* The encoding has 2 interleaved stepped patterns.  */
5394           vec_perm_builder sel (nelt, 2, 3);
5395           sel.quick_grow (6);
5396           for (i = 0; i < 3; i++)
5397             {
5398               sel[i * 2] = i;
5399               sel[i * 2 + 1] = i + nelt;
5400             }
5401           vec_perm_indices indices (sel, 2, nelt);
5402           if (can_vec_perm_const_p (mode, mode, indices))
5403             {
5404               for (i = 0; i < 6; i++)
5405                 sel[i] += exact_div (nelt, 2);
5406               indices.new_vector (sel, 2, nelt);
5407               if (can_vec_perm_const_p (mode, mode, indices))
5408                 return true;
5409             }
5410         }
5411     }
5412
5413   if (dump_enabled_p ())
5414     dump_printf (MSG_MISSED_OPTIMIZATION,
5415                  "permutation op not supported by target.\n");
5416   return false;
5417 }
5418
5419
5420 /* Return TRUE if vec_store_lanes, or vec_mask_store_lanes when MASKED_P is
5421    set, is available for COUNT vectors of type VECTYPE.  */
5422
5423 bool
5424 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5425                             bool masked_p)
5426 {
5427   if (!masked_p)
5428     return vect_lanes_optab_supported_p ("vec_store_lanes",
5429                                          vec_store_lanes_optab,
5430                                          vectype, count);
5431
5432   return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5433                                        vec_mask_store_lanes_optab,
5434                                        vectype, count);
5435 }
5436
5437
5438 /* Function vect_permute_store_chain.
5439
5440    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5441    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5442    the data correctly for the stores.  Return the final references for stores
5443    in RESULT_CHAIN.  For power-of-2 LENGTH each stage also overwrites DR_CHAIN.
5444
5445    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5446    The input is 4 vectors each containing 8 elements.  We assign a number to
5447    each element, the input sequence is:
5448
5449    1st vec:   0  1  2  3  4  5  6  7
5450    2nd vec:   8  9 10 11 12 13 14 15
5451    3rd vec:  16 17 18 19 20 21 22 23
5452    4th vec:  24 25 26 27 28 29 30 31
5453
5454    The output sequence should be:
5455
5456    1st vec:  0  8 16 24  1  9 17 25
5457    2nd vec:  2 10 18 26  3 11 19 27
5458    3rd vec:  4 12 20 28  5 13 21 29
5459    4th vec:  6 14 22 30  7 15 23 31
5460
5461    i.e., we interleave the contents of the four vectors in their order.
5462
5463    We use interleave_high/low instructions to create such output.  The input of
5464    each interleave_high/low operation is two vectors:
5465    1st vec    2nd vec
5466    0 1 2 3    4 5 6 7
5467    the even elements of the result vector are obtained left-to-right from the
5468    high/low elements of the first vector.  The odd elements of the result are
5469    obtained left-to-right from the high/low elements of the second vector.
5470    The output of interleave_high will be:   0 4 1 5
5471    and of interleave_low:                   2 6 3 7
5472
5473
5474    The permutation is done in log LENGTH stages.  In each stage interleave_high
5475    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5476    where the first argument is taken from the first half of DR_CHAIN and the
5477    second argument from its second half.
5478    In our example,
5479
5480    I1: interleave_high (1st vec, 3rd vec)
5481    I2: interleave_low (1st vec, 3rd vec)
5482    I3: interleave_high (2nd vec, 4th vec)
5483    I4: interleave_low (2nd vec, 4th vec)
5484
5485    The output for the first stage is:
5486
5487    I1:  0 16  1 17  2 18  3 19
5488    I2:  4 20  5 21  6 22  7 23
5489    I3:  8 24  9 25 10 26 11 27
5490    I4: 12 28 13 29 14 30 15 31
5491
5492    The output of the second stage, i.e. the final result is:
5493
5494    I1:  0  8 16 24  1  9 17 25
5495    I2:  2 10 18 26  3 11 19 27
5496    I3:  4 12 20 28  5 13 21 29
5497    I4:  6 14 22 30  7 15 23 31.  */
5498
5499 void
5500 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5501                           unsigned int length,
5502                           stmt_vec_info stmt_info,
5503                           gimple_stmt_iterator *gsi,
5504                           vec<tree> *result_chain)
5505 {
5506   tree vect1, vect2, high, low;
5507   gimple *perm_stmt;
5508   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5509   tree perm_mask_low, perm_mask_high;
5510   tree data_ref;
5511   tree perm3_mask_low, perm3_mask_high;
5512   unsigned int i, j, n, log_length = exact_log2 (length);
5513
5514   result_chain->quick_grow (length);
5515   memcpy (result_chain->address (), dr_chain.address (),
5516           length * sizeof (tree));
5517
5518   if (length == 3)
5519     {
5520       /* vect_grouped_store_supported ensures that this is constant.  */
5521       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5522       unsigned int j0 = 0, j1 = 0, j2 = 0;
5523
5524       vec_perm_builder sel (nelt, nelt, 1);
5525       sel.quick_grow (nelt);
5526       vec_perm_indices indices;
5527       for (j = 0; j < 3; j++)
5528         {
5529           int nelt0 = ((3 - j) * nelt) % 3;
5530           int nelt1 = ((3 - j) * nelt + 1) % 3;
5531           int nelt2 = ((3 - j) * nelt + 2) % 3;
5532
5533           for (i = 0; i < nelt; i++)
5534             {
5535               if (3 * i + nelt0 < nelt)
5536                 sel[3 * i + nelt0] = j0++;
5537               if (3 * i + nelt1 < nelt)
5538                 sel[3 * i + nelt1] = nelt + j1++;
5539               if (3 * i + nelt2 < nelt)
5540                 sel[3 * i + nelt2] = 0;
5541             }
5542           indices.new_vector (sel, 2, nelt);
5543           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5544
5545           for (i = 0; i < nelt; i++)
5546             {
5547               if (3 * i + nelt0 < nelt)
5548                 sel[3 * i + nelt0] = 3 * i + nelt0;
5549               if (3 * i + nelt1 < nelt)
5550                 sel[3 * i + nelt1] = 3 * i + nelt1;
5551               if (3 * i + nelt2 < nelt)
5552                 sel[3 * i + nelt2] = nelt + j2++;
5553             }
5554           indices.new_vector (sel, 2, nelt);
5555           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5556
5557           vect1 = dr_chain[0];
5558           vect2 = dr_chain[1];
5559
5560           /* Create interleaving stmt:
5561              low = VEC_PERM_EXPR <vect1, vect2,
5562                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5563                                    j + 2, nelt + j + 2, *, ...}>  */
5564           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5565           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5566                                            vect2, perm3_mask_low);
5567           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5568
5569           vect1 = data_ref;
5570           vect2 = dr_chain[2];
5571           /* Create interleaving stmt:
5572              high = VEC_PERM_EXPR <vect1, vect2,
5573                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
5574                                     6, 7, nelt + j + 2, ...}>  */
5575           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5576           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5577                                            vect2, perm3_mask_high);
5578           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5579           (*result_chain)[j] = data_ref;
5580         }
5581     }
5582   else
5583     {
5584       /* If length is not equal to 3 then only power of 2 is supported.  */
5585       gcc_assert (pow2p_hwi (length));
5586
5587       /* The encoding has 2 interleaved stepped patterns.  */
5588       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5589       vec_perm_builder sel (nelt, 2, 3);
5590       sel.quick_grow (6);
5591       for (i = 0; i < 3; i++)
5592         {
5593           sel[i * 2] = i;
5594           sel[i * 2 + 1] = i + nelt;
5595         }
5596         vec_perm_indices indices (sel, 2, nelt);
5597         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5598
5599         for (i = 0; i < 6; i++)
5600           sel[i] += exact_div (nelt, 2);
5601         indices.new_vector (sel, 2, nelt);
5602         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5603
5604         for (i = 0, n = log_length; i < n; i++)
5605           {
5606             for (j = 0; j < length/2; j++)
5607               {
5608                 vect1 = dr_chain[j];
5609                 vect2 = dr_chain[j+length/2];
5610
5611                 /* Create interleaving stmt:
5612                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5613                                                         ...}>  */
5614                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5615                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5616                                                  vect2, perm_mask_high);
5617                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5618                 (*result_chain)[2*j] = high;
5619
5620                 /* Create interleaving stmt:
5621                    low = VEC_PERM_EXPR <vect1, vect2,
5622                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5623                                          ...}>  */
5624                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5625                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5626                                                  vect2, perm_mask_low);
5627                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5628                 (*result_chain)[2*j+1] = low;
5629               }
5630             memcpy (dr_chain.address (), result_chain->address (),
5631                     length * sizeof (tree));
5632           }
5633     }
5634 }
5635
5636 /* Function vect_setup_realignment
5637
5638    This function is called when vectorizing an unaligned load using
5639    the dr_explicit_realign[_optimized] scheme.
5640    This function generates the following code at the loop prolog:
5641
5642       p = initial_addr;
5643    x  msq_init = *(floor(p));   # prolog load
5644       realignment_token = call target_builtin;
5645     loop:
5646    x  msq = phi (msq_init, ---)
5647
5648    The stmts marked with x are generated only for the case of
5649    dr_explicit_realign_optimized.
5650
5651    The code above sets up a new (vector) pointer, pointing to the first
5652    location accessed by STMT_INFO, and a "floor-aligned" load using that
5653    pointer.  It also generates code to compute the "realignment-token"
5654    (if the relevant target hook was defined), and creates a phi-node at the
5655    loop-header bb whose arguments are the result of the prolog-load (created
5656    by this function) and the result of a load that takes place in the loop
5657    (to be created by the caller to this function).
5658
5659    For the case of dr_explicit_realign_optimized:
5660    The caller to this function uses the phi-result (msq) to create the
5661    realignment code inside the loop, and sets up the missing phi argument,
5662    as follows:
5663     loop:
5664       msq = phi (msq_init, lsq)
5665       lsq = *(floor(p'));        # load in loop
5666       result = realign_load (msq, lsq, realignment_token);
5667
5668    For the case of dr_explicit_realign:
5669     loop:
5670       msq = *(floor(p));        # load in loop
5671       p' = p + (VS-1);
5672       lsq = *(floor(p'));       # load in loop
5673       result = realign_load (msq, lsq, realignment_token);
5674
5675    Input:
5676    STMT_INFO - (scalar) load stmt to be vectorized, possibly unaligned.
5677    GSI - place where new code is to be inserted.
5678    ALIGNMENT_SUPPORT_SCHEME - which of the two realignment schemes is used.
5679    INIT_ADDR - if not NULL_TREE, the address computed by the caller in-loop.
5680
5681    Output:
5682    REALIGNMENT_TOKEN - result of the builtin_mask_for_load call, if defined.
5683    AT_LOOP - if not NULL, set to the loop chosen for the initial vector load.
5684    Return value - the result of the loop-header phi node, or NULL_TREE for
5685                   dr_explicit_realign.  */
5686
5687 tree
5688 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5689                         gimple_stmt_iterator *gsi, tree *realignment_token,
5690                         enum dr_alignment_support alignment_support_scheme,
5691                         tree init_addr,
5692                         class loop **at_loop)
5693 {
5694   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5695   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5696   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5697   struct data_reference *dr = dr_info->dr;
5698   class loop *loop = NULL;
5699   edge pe = NULL;
5700   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5701   tree vec_dest;
5702   gimple *inc;
5703   tree ptr;
5704   tree data_ref;
5705   basic_block new_bb;
5706   tree msq_init = NULL_TREE;
5707   tree new_temp;
5708   gphi *phi_stmt;
5709   tree msq = NULL_TREE;
5710   gimple_seq stmts = NULL;
5711   bool compute_in_loop = false;
5712   bool nested_in_vect_loop = false;
5713   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5714   class loop *loop_for_initial_load = NULL;
5715
5716   if (loop_vinfo)
5717     {
5718       loop = LOOP_VINFO_LOOP (loop_vinfo);
5719       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5720     }
5721
5722   gcc_assert (alignment_support_scheme == dr_explicit_realign
5723               || alignment_support_scheme == dr_explicit_realign_optimized);
5724
5725   /* We need to generate three things:
5726      1. the misalignment computation
5727      2. the extra vector load (for the optimized realignment scheme).
5728      3. the phi node for the two vectors from which the realignment is
5729       done (for the optimized realignment scheme).  */
5730
5731   /* 1. Determine where to generate the misalignment computation.
5732
5733      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5734      calculation will be generated by this function, outside the loop (in the
5735      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5736      caller, inside the loop.
5737
5738      Background: If the misalignment remains fixed throughout the iterations of
5739      the loop, then both realignment schemes are applicable, and also the
5740      misalignment computation can be done outside LOOP.  This is because we are
5741      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5742      are a multiple of VS (the Vector Size), and therefore the misalignment in
5743      different vectorized LOOP iterations is always the same.
5744      The problem arises only if the memory access is in an inner-loop nested
5745      inside LOOP, which is now being vectorized using outer-loop vectorization.
5746      This is the only case when the misalignment of the memory access may not
5747      remain fixed throughout the iterations of the inner-loop (as explained in
5748      detail in vect_supportable_dr_alignment).  In this case, not only is the
5749      optimized realignment scheme not applicable, but also the misalignment
5750      computation (and generation of the realignment token that is passed to
5751      REALIGN_LOAD) have to be done inside the loop.
5752
5753      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5754      or not, which in turn determines if the misalignment is computed inside
5755      the inner-loop, or outside LOOP.  */
5756
5757   if (init_addr != NULL_TREE || !loop_vinfo)
5758     {
5759       compute_in_loop = true;
5760       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5761     }
5762
5763
5764   /* 2. Determine where to generate the extra vector load.
5765
5766      For the optimized realignment scheme, instead of generating two vector
5767      loads in each iteration, we generate a single extra vector load in the
5768      preheader of the loop, and in each iteration reuse the result of the
5769      vector load from the previous iteration.  In case the memory access is in
5770      an inner-loop nested inside LOOP, which is now being vectorized using
5771      outer-loop vectorization, we need to determine whether this initial vector
5772      load should be generated at the preheader of the inner-loop, or can be
5773      generated at the preheader of LOOP.  If the memory access has no evolution
5774      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5775      to be generated inside LOOP (in the preheader of the inner-loop).  */
5776
5777   if (nested_in_vect_loop)
5778     {
5779       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5780       bool invariant_in_outerloop =
5781             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5782       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5783     }
5784   else
5785     loop_for_initial_load = loop;
5786   if (at_loop)
5787     *at_loop = loop_for_initial_load;
5788
5789   tree vuse = NULL_TREE;
5790   if (loop_for_initial_load)
5791     {
5792       pe = loop_preheader_edge (loop_for_initial_load);
5793       if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5794         vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5795     }
5796   if (!vuse)
5797     vuse = gimple_vuse (gsi_stmt (*gsi));
5798
5799   /* 3. For the case of the optimized realignment, create the first vector
5800       load at the loop preheader.  */
5801
5802   if (alignment_support_scheme == dr_explicit_realign_optimized)
5803     {
5804       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5805       gassign *new_stmt;
5806
5807       gcc_assert (!compute_in_loop);
5808       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5809       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5810                                       loop_for_initial_load, NULL_TREE,
5811                                       &init_addr, NULL, &inc, true);
5812       if (TREE_CODE (ptr) == SSA_NAME)
5813         new_temp = copy_ssa_name (ptr);
5814       else
5815         new_temp = make_ssa_name (TREE_TYPE (ptr));
5816       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5817       tree type = TREE_TYPE (ptr);
5818       new_stmt = gimple_build_assign
5819                    (new_temp, BIT_AND_EXPR, ptr,
5820                     fold_build2 (MINUS_EXPR, type,
5821                                  build_int_cst (type, 0),
5822                                  build_int_cst (type, align)));
5823       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5824       gcc_assert (!new_bb);
5825       data_ref
5826         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5827                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5828       vect_copy_ref_info (data_ref, DR_REF (dr));
5829       new_stmt = gimple_build_assign (vec_dest, data_ref);
5830       new_temp = make_ssa_name (vec_dest, new_stmt);
5831       gimple_assign_set_lhs (new_stmt, new_temp);
5832       gimple_set_vuse (new_stmt, vuse);
5833       if (pe)
5834         {
5835           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5836           gcc_assert (!new_bb);
5837         }
5838       else
5839          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5840
5841       msq_init = gimple_assign_lhs (new_stmt);
5842     }
5843
5844   /* 4. Create realignment token using a target builtin, if available.
5845       It is done either inside the containing loop, or before LOOP (as
5846       determined above).  */
5847
5848   if (targetm.vectorize.builtin_mask_for_load)
5849     {
5850       gcall *new_stmt;
5851       tree builtin_decl;
5852
5853       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5854       if (!init_addr)
5855         {
5856           /* Generate the INIT_ADDR computation outside LOOP.  */
5857           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5858                                                             stmt_info, &stmts,
5859                                                             NULL_TREE);
5860           if (loop)
5861             {
5862               pe = loop_preheader_edge (loop);
5863               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5864               gcc_assert (!new_bb);
5865             }
5866           else
5867              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5868         }
5869
5870       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5871       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5872       vec_dest =
5873         vect_create_destination_var (scalar_dest,
5874                                      gimple_call_return_type (new_stmt));
5875       new_temp = make_ssa_name (vec_dest, new_stmt);
5876       gimple_call_set_lhs (new_stmt, new_temp);
5877
5878       if (compute_in_loop)
5879         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5880       else
5881         {
5882           /* Generate the misalignment computation outside LOOP.  */
5883           pe = loop_preheader_edge (loop);
5884           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5885           gcc_assert (!new_bb);
5886         }
5887
5888       *realignment_token = gimple_call_lhs (new_stmt);
5889
5890       /* The result of the CALL_EXPR to this builtin is determined from
5891          the value of the parameter and no global variables are touched
5892          which makes the builtin a "const" function.  Requiring the
5893          builtin to have the "const" attribute makes it unnecessary
5894          to call mark_call_clobbered.  */
5895       gcc_assert (TREE_READONLY (builtin_decl));
5896     }
5897
5898   if (alignment_support_scheme == dr_explicit_realign)
5899     return msq;
5900
5901   gcc_assert (!compute_in_loop);
5902   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5903
5904
5905   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5906
5907   pe = loop_preheader_edge (containing_loop);
5908   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5909   msq = make_ssa_name (vec_dest);
5910   phi_stmt = create_phi_node (msq, containing_loop->header);
5911   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5912
5913   return msq;
5914 }
5915
5916
5917 /* Function vect_grouped_load_supported.
5918
5919    COUNT is the size of the load group (the number of statements plus the
5920    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5921    only one statement, with a gap of COUNT - 1.
5922
5923    Returns true if the target supports the permutes the group needs.  */
5924
5925 bool
5926 vect_grouped_load_supported (tree vectype, bool single_element_p,
5927                              unsigned HOST_WIDE_INT count)
5928 {
5929   machine_mode mode = TYPE_MODE (vectype);
5930
5931   /* If this is single-element interleaving with an element distance
5932      that leaves unused vector loads around punt - we at least create
5933      very sub-optimal code in that case (and blow up memory,
5934      see PR65518).  */
5935   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5936     {
5937       if (dump_enabled_p ())
5938         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5939                          "single-element interleaving not supported "
5940                          "for not adjacent vector loads\n");
5941       return false;
5942     }
5943
5944   /* vect_permute_load_chain requires the group size to be equal to 3 or
5945      be a power of two.  */
5946   if (count != 3 && exact_log2 (count) == -1)
5947     {
5948       if (dump_enabled_p ())
5949         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5950                          "the size of the group of accesses"
5951                          " is not a power of 2 or not equal to 3\n");
5952       return false;
5953     }
5954
5955   /* Check that the permutation is supported.  */
5956   if (VECTOR_MODE_P (mode))
5957     {
5958       unsigned int i, j;
5959       if (count == 3)
5960         {
5961           unsigned int nelt;
5962           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5963             {
5964               if (dump_enabled_p ())
5965                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5966                                  "cannot handle groups of 3 loads for"
5967                                  " variable-length vectors\n");
5968               return false;
5969             }
5970
5971           vec_perm_builder sel (nelt, nelt, 1);
5972           sel.quick_grow (nelt);
5973           vec_perm_indices indices;
5974           unsigned int k;
5975           for (k = 0; k < 3; k++)
5976             {
5977               for (i = 0; i < nelt; i++)
5978                 if (3 * i + k < 2 * nelt)
5979                   sel[i] = 3 * i + k;
5980                 else
5981                   sel[i] = 0;
5982               indices.new_vector (sel, 2, nelt);
5983               if (!can_vec_perm_const_p (mode, mode, indices))
5984                 {
5985                   if (dump_enabled_p ())
5986                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5987                                      "shuffle of 3 loads is not supported by"
5988                                      " target\n");
5989                   return false;
5990                 }
5991               for (i = 0, j = 0; i < nelt; i++)
5992                 if (3 * i + k < 2 * nelt)
5993                   sel[i] = i;
5994                 else
5995                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5996               indices.new_vector (sel, 2, nelt);
5997               if (!can_vec_perm_const_p (mode, mode, indices))
5998                 {
5999                   if (dump_enabled_p ())
6000                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6001                                      "shuffle of 3 loads is not supported by"
6002                                      " target\n");
6003                   return false;
6004                 }
6005             }
6006           return true;
6007         }
6008       else
6009         {
6010           /* If length is not equal to 3 then only power of 2 is supported.  */
6011           gcc_assert (pow2p_hwi (count));
6012           poly_uint64 nelt = GET_MODE_NUNITS (mode);
6013
6014           /* Single stepped pattern: { 0, 2, 4, ... }, then { 1, 3, 5, ... }.  */
6015           vec_perm_builder sel (nelt, 1, 3);
6016           sel.quick_grow (3);
6017           for (i = 0; i < 3; i++)
6018             sel[i] = i * 2;
6019           vec_perm_indices indices (sel, 2, nelt);
6020           if (can_vec_perm_const_p (mode, mode, indices))
6021             {
6022               for (i = 0; i < 3; i++)
6023                 sel[i] = i * 2 + 1;
6024               indices.new_vector (sel, 2, nelt);
6025               if (can_vec_perm_const_p (mode, mode, indices))
6026                 return true;
6027             }
6028         }
6029     }
6030
6031   if (dump_enabled_p ())
6032     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6033                      "extract even/odd not supported by target\n");
6034   return false;
6035 }
6036
6037 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6038    type VECTYPE.  MASKED_P says whether the masked form is needed: when it
        is set vec_mask_load_lanes_optab is queried, otherwise
        vec_load_lanes_optab.  The query itself is delegated to
        vect_lanes_optab_supported_p, which is also handed the optab's
        name as a string.  */
6039
6040 bool
6041 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6042                            bool masked_p)
6043 {
6044   if (masked_p)
6045     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6046                                          vec_mask_load_lanes_optab,
6047                                          vectype, count);
6048   else
6049     return vect_lanes_optab_supported_p ("vec_load_lanes",
6050                                          vec_load_lanes_optab,
6051                                          vectype, count);
6052 }
6053
6054 /* Function vect_permute_load_chain.
6055
6056    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6057    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6058    the input data correctly.  Return the final references for loads in
6059    RESULT_CHAIN.
6060
6061    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6062    The input is 4 vectors each containing 8 elements. We assign a number to each
6063    element, the input sequence is:
6064
6065    1st vec:   0  1  2  3  4  5  6  7
6066    2nd vec:   8  9 10 11 12 13 14 15
6067    3rd vec:  16 17 18 19 20 21 22 23
6068    4th vec:  24 25 26 27 28 29 30 31
6069
6070    The output sequence should be:
6071
6072    1st vec:  0 4  8 12 16 20 24 28
6073    2nd vec:  1 5  9 13 17 21 25 29
6074    3rd vec:  2 6 10 14 18 22 26 30
6075    4th vec:  3 7 11 15 19 23 27 31
6076
6077    i.e., the first output vector should contain the first elements of each
6078    interleaving group, etc.
6079
6080    We use extract_even/odd instructions to create such output.  The input of
6081    each extract_even/odd operation is two vectors
6082    1st vec    2nd vec
6083    0 1 2 3    4 5 6 7
6084
6085    and the output is the vector of extracted even/odd elements.  The output of
6086    extract_even will be:   0 2 4 6
6087    and of extract_odd:     1 3 5 7
6088
6089
6090    The permutation is done in log LENGTH stages.  In each stage extract_even
6091    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6092    their order.  In our example,
6093
6094    E1: extract_even (1st vec, 2nd vec)
6095    E2: extract_odd (1st vec, 2nd vec)
6096    E3: extract_even (3rd vec, 4th vec)
6097    E4: extract_odd (3rd vec, 4th vec)
6098
6099    The output for the first stage will be:
6100
6101    E1:  0  2  4  6  8 10 12 14
6102    E2:  1  3  5  7  9 11 13 15
6103    E3: 16 18 20 22 24 26 28 30
6104    E4: 17 19 21 23 25 27 29 31
6105
6106    In order to proceed and create the correct sequence for the next stage (or
6107    for the correct output, if the second stage is the last one, as in our
6108    example), we first put the output of extract_even operation and then the
6109    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6110    The input for the second stage is:
6111
6112    1st vec (E1):  0  2  4  6  8 10 12 14
6113    2nd vec (E3): 16 18 20 22 24 26 28 30
6114    3rd vec (E2):  1  3  5  7  9 11 13 15
6115    4th vec (E4): 17 19 21 23 25 27 29 31
6116
6117    The output of the second stage:
6118
6119    E1: 0 4  8 12 16 20 24 28
6120    E2: 2 6 10 14 18 22 26 30
6121    E3: 1 5  9 13 17 21 25 29
6122    E4: 3 7 11 15 19 23 27 31
6123
6124    And RESULT_CHAIN after reordering:
6125
6126    1st vec (E1):  0 4  8 12 16 20 24 28
6127    2nd vec (E3):  1 5  9 13 17 21 25 29
6128    3rd vec (E2):  2 6 10 14 18 22 26 30
6129    4th vec (E4):  3 7 11 15 19 23 27 31.

        When LENGTH is 3 no extract_even/odd is used.  Instead each of the
        three output vectors is built with two VEC_PERM_EXPRs: the first
        picks every third element out of DR_CHAIN[0] and DR_CHAIN[1], the
        second keeps those elements and fills the remaining lanes from
        DR_CHAIN[2].

        In the power-of-2 case the contents of DR_CHAIN are overwritten with
        the intermediate results of each stage.  */
6130
6131 static void
6132 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6133                          unsigned int length,
6134                          stmt_vec_info stmt_info,
6135                          gimple_stmt_iterator *gsi,
6136                          vec<tree> *result_chain)
6137 {
6138   tree data_ref, first_vect, second_vect;
6139   tree perm_mask_even, perm_mask_odd;
6140   tree perm3_mask_low, perm3_mask_high;
6141   gimple *perm_stmt;
6142   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6143   unsigned int i, j, log_length = exact_log2 (length);
6144
       /* Start with RESULT_CHAIN holding a copy of DR_CHAIN.  */
6145   result_chain->quick_grow (length);
6146   memcpy (result_chain->address (), dr_chain.address (),
6147           length * sizeof (tree));
6148
6149   if (length == 3)
6150     {
6151       /* vect_grouped_load_supported ensures that this is constant.  */
6152       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6153       unsigned int k;
6154
6155       vec_perm_builder sel (nelt, nelt, 1);
6156       sel.quick_grow (nelt);
6157       vec_perm_indices indices;
6158       for (k = 0; k < 3; k++)
6159         {
               /* Mask selecting elements k, k + 3, k + 6, ... from the
                  concatenation of the first two vectors.  Lanes that would
                  read past those two vectors get index 0 as a placeholder;
                  they are replaced by the second permutation below.  */
6160           for (i = 0; i < nelt; i++)
6161             if (3 * i + k < 2 * nelt)
6162               sel[i] = 3 * i + k;
6163             else
6164               sel[i] = 0;
6165           indices.new_vector (sel, 2, nelt);
6166           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6167
               /* Mask keeping the lanes filled by the first permutation and
                  taking the remaining ones from the second operand, starting
                  at its element (nelt + k) % 3 and advancing by 3.  */
6168           for (i = 0, j = 0; i < nelt; i++)
6169             if (3 * i + k < 2 * nelt)
6170               sel[i] = i;
6171             else
6172               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6173           indices.new_vector (sel, 2, nelt);
6174           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6175
6176           first_vect = dr_chain[0];
6177           second_vect = dr_chain[1];
6178
6179           /* Create interleaving stmt (low part of):
6180              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6181                                                             ...}>  */
6182           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6183           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6184                                            second_vect, perm3_mask_low);
6185           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6186
6187           /* Create interleaving stmt (high part of): keep the lanes already
6188              filled in LOW and take the rest from the third vector:
6189              high = VEC_PERM_EXPR <low, dr_chain[2], {0, 1, ...,
                                       nelt + (nelt + k) % 3, ...}>  */
6190           first_vect = data_ref;
6191           second_vect = dr_chain[2];
6192           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6193           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6194                                            second_vect, perm3_mask_high);
6195           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6196           (*result_chain)[k] = data_ref;
6197         }
6198     }
6199   else
6200     {
6201       /* If length is not equal to 3 then only power of 2 is supported.  */
6202       gcc_assert (pow2p_hwi (length));
6203
6204       /* The encoding has a single stepped pattern.  */
6205       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6206       vec_perm_builder sel (nelt, 1, 3);
6207       sel.quick_grow (3);
6208       for (i = 0; i < 3; ++i)
6209         sel[i] = i * 2;
6210       vec_perm_indices indices (sel, 2, nelt);
6211       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6212
6213       for (i = 0; i < 3; ++i)
6214         sel[i] = i * 2 + 1;
6215       indices.new_vector (sel, 2, nelt);
6216       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6217
6218       for (i = 0; i < log_length; i++)
6219         {
6220           for (j = 0; j < length; j += 2)
6221             {
6222               first_vect = dr_chain[j];
6223               second_vect = dr_chain[j+1];
6224
6225               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6226               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6227               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6228                                                first_vect, second_vect,
6229                                                perm_mask_even);
6230               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6231               (*result_chain)[j/2] = data_ref;
6232
6233               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6234               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6235               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6236                                                first_vect, second_vect,
6237                                                perm_mask_odd);
6238               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6239               (*result_chain)[j/2+length/2] = data_ref;
6240             }
               /* Feed the output of this stage back in as the input of the
                  next one.  */
6241           memcpy (dr_chain.address (), result_chain->address (),
6242                   length * sizeof (tree));
6243         }
6244     }
6245 }
6246
6247 /* Function vect_shift_permute_load_chain.
6248
6249    Given a chain of loads in DR_CHAIN of LENGTH that is a power of 2 or
6250    equal to 3, generate a sequence of stmts to reorder the input data
6251    accordingly.  Return the final references for loads in RESULT_CHAIN.
6252    Return true on success, false otherwise.
6253
6254    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6255    The input is 3 vectors each containing 8 elements.  We assign a
6256    number to each element, the input sequence is:
6257
6258    1st vec:   0  1  2  3  4  5  6  7
6259    2nd vec:   8  9 10 11 12 13 14 15
6260    3rd vec:  16 17 18 19 20 21 22 23
6261
6262    The output sequence should be:
6263
6264    1st vec:  0 3 6  9 12 15 18 21
6265    2nd vec:  1 4 7 10 13 16 19 22
6266    3rd vec:  2 5 8 11 14 17 20 23
6267
6268    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6269
6270    First we shuffle all 3 vectors to get correct elements order:
6271
6272    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6273    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6274    3rd vec:  (16 19 22) (17 20 23) (18 21)
6275
6276    Next we unite and shift vector 3 times:
6277
6278    1st step:
6279      shift right by 6 the concatenation of:
6280      "1st vec" and  "2nd vec"
6281        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6282      "2nd vec" and  "3rd vec"
6283        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6284      "3rd vec" and  "1st vec"
6285        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6286                              | New vectors                   |
6287
6288      So that now new vectors are:
6289
6290      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6291      2nd vec:  (10 13) (16 19 22) (17 20 23)
6292      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6293
6294    2nd step:
6295      shift right by 5 the concatenation of:
6296      "1st vec" and  "3rd vec"
6297        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6298      "2nd vec" and  "1st vec"
6299        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6300      "3rd vec" and  "2nd vec"
6301        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6302                           | New vectors                   |
6303
6304      So that now new vectors are:
6305
6306      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6307      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6308      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6309
6310    3rd step:
6311      shift right by 5 the concatenation of:
6312      "1st vec" and  "1st vec"
6313        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6314      shift right by 3 the concatenation of:
6315      "2nd vec" and  "2nd vec"
6316                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6317                           | New vectors                   |
6318
6319      So that now all vectors are READY:
6320      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6321      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6322      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6323
6324    This algorithm is faster than one in vect_permute_load_chain if:
6325      1.  "shift of a concatenation" is faster than general permutation.
6326          This is usually so.
6327      2.  The TARGET machine can't execute vector instructions in parallel.
6328          This is because each step of the algorithm depends on previous.
6329          The algorithm in vect_permute_load_chain is much more parallel.
6330
6331    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.

        False is returned without generating any stmt when the number of vector
        elements or the vectorization factor is not a compile-time constant,
        when the target does not support one of the required permutations, or
        when neither the power-of-2 case (which requires VF > 4) nor the
        LENGTH == 3 case (which requires VF > 2) applies.  Except for the
        first of these cases RESULT_CHAIN has by then already been grown to
        LENGTH and filled with a copy of DR_CHAIN.  In the power-of-2 case
        the contents of DR_CHAIN are overwritten with intermediate results.

        NOTE(review): the dyn_cast of VINFO to loop_vec_info is used without a
        null check, so this is presumably only reached for loop
        vectorization -- confirm.
6332 */
6333
6334 static bool
6335 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6336                                unsigned int length,
6337                                stmt_vec_info stmt_info,
6338                                gimple_stmt_iterator *gsi,
6339                                vec<tree> *result_chain)
6340 {
6341   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6342   tree perm2_mask1, perm2_mask2, perm3_mask;
6343   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6344   gimple *perm_stmt;
6345
6346   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6347   machine_mode vmode = TYPE_MODE (vectype);
6348   unsigned int i;
6349   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6350
6351   unsigned HOST_WIDE_INT nelt, vf;
6352   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6353       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6354     /* Not supported for variable-length vectors.  */
6355     return false;
6356
6357   vec_perm_builder sel (nelt, nelt, 1);
6358   sel.quick_grow (nelt);
6359
       /* Start with RESULT_CHAIN holding a copy of DR_CHAIN.  */
6360   result_chain->quick_grow (length);
6361   memcpy (result_chain->address (), dr_chain.address (),
6362           length * sizeof (tree));
6363
6364   if (pow2p_hwi (length) && vf > 4)
6365     {
6366       unsigned int j, log_length = exact_log2 (length);
           /* Generating permutation constant to put the even elements of a
              vector first, followed by its odd elements.
              For vector length 8 it is {0 2 4 6 1 3 5 7}.  */
6367       for (i = 0; i < nelt / 2; ++i)
6368         sel[i] = i * 2;
6369       for (i = 0; i < nelt / 2; ++i)
6370         sel[nelt / 2 + i] = i * 2 + 1;
6371       vec_perm_indices indices (sel, 2, nelt);
6372       if (!can_vec_perm_const_p (vmode, vmode, indices))
6373         {
               /* NOTE(review): the backslash-newline inside the literal
                  makes the leading whitespace of the continuation line
                  part of the dumped message.  The same applies to the
                  other continued literals in this function.  */
6374           if (dump_enabled_p ())
6375             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6376                              "shuffle of 2 fields structure is not \
6377                               supported by target\n");
6378           return false;
6379         }
6380       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6381
           /* Generating permutation constant to put the odd elements of a
              vector first, followed by its even elements.
              For vector length 8 it is {1 3 5 7 0 2 4 6}.  */
6382       for (i = 0; i < nelt / 2; ++i)
6383         sel[i] = i * 2 + 1;
6384       for (i = 0; i < nelt / 2; ++i)
6385         sel[nelt / 2 + i] = i * 2;
6386       indices.new_vector (sel, 2, nelt);
6387       if (!can_vec_perm_const_p (vmode, vmode, indices))
6388         {
6389           if (dump_enabled_p ())
6390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6391                              "shuffle of 2 fields structure is not \
6392                               supported by target\n");
6393           return false;
6394         }
6395       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6396
6397       /* Generating permutation constant to shift all elements.
6398          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6399       for (i = 0; i < nelt; i++)
6400         sel[i] = nelt / 2 + i;
6401       indices.new_vector (sel, 2, nelt);
6402       if (!can_vec_perm_const_p (vmode, vmode, indices))
6403         {
6404           if (dump_enabled_p ())
6405             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6406                              "shift permutation is not supported by target\n");
6407           return false;
6408         }
6409       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6410
6411       /* Generating permutation constant to select vector from 2.
6412          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6413       for (i = 0; i < nelt / 2; i++)
6414         sel[i] = i;
6415       for (i = nelt / 2; i < nelt; i++)
6416         sel[i] = nelt + i;
6417       indices.new_vector (sel, 2, nelt);
6418       if (!can_vec_perm_const_p (vmode, vmode, indices))
6419         {
6420           if (dump_enabled_p ())
6421             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6422                              "select is not supported by target\n");
6423           return false;
6424         }
6425       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6426
6427       for (i = 0; i < log_length; i++)
6428         {
6429           for (j = 0; j < length; j += 2)
6430             {
6431               first_vect = dr_chain[j];
6432               second_vect = dr_chain[j + 1];
6433
6434               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6435               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6436                                                first_vect, first_vect,
6437                                                perm2_mask1);
6438               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6439               vect[0] = data_ref;
6440
6441               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6442               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6443                                                second_vect, second_vect,
6444                                                perm2_mask2);
6445               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6446               vect[1] = data_ref;
6447
                   /* Odd elements of FIRST_VECT followed by those of
                      SECOND_VECT.  */
6448               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6449               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6450                                                vect[0], vect[1], shift1_mask);
6451               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6452               (*result_chain)[j/2 + length/2] = data_ref;
6453
                   /* Even elements of FIRST_VECT followed by those of
                      SECOND_VECT.  */
6454               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6455               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6456                                                vect[0], vect[1], select_mask);
6457               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6458               (*result_chain)[j/2] = data_ref;
6459             }
               /* Feed the output of this stage back in as the input of the
                  next one.  */
6460           memcpy (dr_chain.address (), result_chain->address (),
6461                   length * sizeof (tree));
6462         }
6463       return true;
6464     }
6465   if (length == 3 && vf > 2)
6466     {
6467       unsigned int k = 0, l = 0;
6468
6469       /* Generating permutation constant to get all elements in right order.
6470          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6471       for (i = 0; i < nelt; i++)
6472         {
6473           if (3 * k + (l % 3) >= nelt)
6474             {
6475               k = 0;
6476               l += (3 - (nelt % 3));
6477             }
6478           sel[i] = 3 * k + (l % 3);
6479           k++;
6480         }
6481       vec_perm_indices indices (sel, 2, nelt);
6482       if (!can_vec_perm_const_p (vmode, vmode, indices))
6483         {
6484           if (dump_enabled_p ())
6485             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6486                              "shuffle of 3 fields structure is not \
6487                               supported by target\n");
6488           return false;
6489         }
6490       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6491
6492       /* Generating permutation constant to shift all elements.
6493          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6494       for (i = 0; i < nelt; i++)
6495         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6496       indices.new_vector (sel, 2, nelt);
6497       if (!can_vec_perm_const_p (vmode, vmode, indices))
6498         {
6499           if (dump_enabled_p ())
6500             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6501                              "shift permutation is not supported by target\n");
6502           return false;
6503         }
6504       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6505
6506       /* Generating permutation constant to shift all elements.
6507          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6508       for (i = 0; i < nelt; i++)
6509         sel[i] = 2 * (nelt / 3) + 1 + i;
6510       indices.new_vector (sel, 2, nelt);
6511       if (!can_vec_perm_const_p (vmode, vmode, indices))
6512         {
6513           if (dump_enabled_p ())
6514             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6515                              "shift permutation is not supported by target\n");
6516           return false;
6517         }
6518       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6519
6520       /* Generating permutation constant to shift all elements.
6521          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6522       for (i = 0; i < nelt; i++)
6523         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6524       indices.new_vector (sel, 2, nelt);
6525       if (!can_vec_perm_const_p (vmode, vmode, indices))
6526         {
6527           if (dump_enabled_p ())
6528             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529                              "shift permutation is not supported by target\n");
6530           return false;
6531         }
6532       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6533
6534       /* Generating permutation constant to shift all elements.
6535          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6536       for (i = 0; i < nelt; i++)
6537         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6538       indices.new_vector (sel, 2, nelt);
6539       if (!can_vec_perm_const_p (vmode, vmode, indices))
6540         {
6541           if (dump_enabled_p ())
6542             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6543                              "shift permutation is not supported by target\n");
6544           return false;
6545         }
6546       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6547
           /* Shuffle each of the three input vectors on its own.  */
6548       for (k = 0; k < 3; k++)
6549         {
6550           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6551           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6552                                            dr_chain[k], dr_chain[k],
6553                                            perm3_mask);
6554           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6555           vect[k] = data_ref;
6556         }
6557
           /* 1st step: shift the concatenation of each shuffled vector and
              its successor (wrapping around).  */
6558       for (k = 0; k < 3; k++)
6559         {
6560           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6561           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6562                                            vect[k % 3], vect[(k + 1) % 3],
6563                                            shift1_mask);
6564           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6565           vect_shift[k] = data_ref;
6566         }
6567
           /* 2nd step: shift again, this time pairing up the results of the
              1st step.  */
6568       for (k = 0; k < 3; k++)
6569         {
6570           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6571           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6572                                            vect_shift[(4 - k) % 3],
6573                                            vect_shift[(3 - k) % 3],
6574                                            shift2_mask);
6575           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6576           vect[k] = data_ref;
6577         }
6578
           /* VECT[2] is stored as is; the other two each need one more shift
              of a vector concatenated with itself (3rd step).  For vector
              length 8 the three stores below go to RESULT_CHAIN[1],
              RESULT_CHAIN[2] and RESULT_CHAIN[0] respectively.  */
6579       (*result_chain)[3 - (nelt % 3)] = vect[2];
6580
6581       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6582       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6583                                        vect[0], shift3_mask);
6584       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6585       (*result_chain)[nelt % 3] = data_ref;
6586
6587       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6588       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6589                                        vect[1], shift4_mask);
6590       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6591       (*result_chain)[0] = data_ref;
6592       return true;
6593     }
6594   return false;
6595 }
6596
6597 /* Function vect_transform_grouped_load.
6598
6599    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6600    to perform their permutation and ascribe the result vectorized statements to
6601    the scalar statements.

        SIZE is passed on to the chain builders as the length of DR_CHAIN.
        VINFO, STMT_INFO and GSI are handed to the chain builders as well,
        which pass them to vect_finish_stmt_generation for every statement
        they create.
6602 */
6603
6604 void
6605 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6606                              vec<tree> dr_chain,
6607                              int size, gimple_stmt_iterator *gsi)
6608 {
6609   machine_mode mode;
6610   vec<tree> result_chain = vNULL;
6611
6612   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6613      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6614      vectors, that are ready for vector computation.  */
6615   result_chain.create (size);
6616
6617   /* If reassociation width for vector type is 2 or greater target machine can
6618      execute 2 or more vector instructions in parallel.  Otherwise try to
6619      get chain for loads group using vect_shift_permute_load_chain.  The
          shift-based variant is likewise not tried when SIZE is a power of 2.
          Whenever it is not tried, or it returns false, the chain is built
          by vect_permute_load_chain instead.  */
6620   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6621   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6622       || pow2p_hwi (size)
6623       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6624                                          gsi, &result_chain))
6625     vect_permute_load_chain (vinfo, dr_chain,
6626                              size, stmt_info, gsi, &result_chain);
6627   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6628   result_chain.release ();
6629 }
6630
6631 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6632    generated as part of the vectorization of STMT_INFO.  Assign the statement
6633    for each vector to the associated scalar statement.  The vec_info
        argument is unused.  */
6634
6635 void
6636 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6637                                   vec<tree> result_chain)
6638 {
6639   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6640   unsigned int i, gap_count;
6641   tree tmp_data_ref;
6642
6643   /* Record the defining statement of each permuted data-ref in the
6644      STMT_VINFO_VEC_STMTS of its scalar statement.  Since we scan the group
6645      starting from its first node, the order corresponds to the order of
          data-refs in RESULT_CHAIN.  */
6646   stmt_vec_info next_stmt_info = first_stmt_info;
6647   gap_count = 1;
6648   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6649     {
           /* Stop once every statement of the group has been handled.  */
6650       if (!next_stmt_info)
6651         break;
6652
6653       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6654        code elimination pass later.  No need to check for the first stmt in
6655        the group, since it always exists.
6656        DR_GROUP_GAP is the number of steps in elements from the previous
6657        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6658        correspond to the gaps.  */
6659       if (next_stmt_info != first_stmt_info
6660           && gap_count < DR_GROUP_GAP (next_stmt_info))
6661         {
6662           gap_count++;
6663           continue;
6664         }
6665
6666       /* ???  The following needs cleanup after the removal of
6667          DR_GROUP_SAME_DR_STMT.  NEXT_STMT_INFO is already known to be
              non-null here because of the early break above, so this test is
              always true.  */
6668       if (next_stmt_info)
6669         {
6670           gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6671           /* The new vector statement is appended, so in the case of multiple
6672              copies it ends up last.  */
6673           STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6674
6675           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6676           gap_count = 1;
6677         }
6678     }
6679 }
6680
6681 /* Function vect_can_force_dr_alignment_p.
6682
6683    Returns whether the alignment of a DECL can be forced to be aligned
6684    on ALIGNMENT bit boundary.  This is never the case for something that
        is not a variable, nor for a decl in the symbol table whose node
        reports that its alignment cannot be increased.  Otherwise ALIGNMENT
        must be known not to exceed MAX_OFILE_ALIGNMENT for a static variable
        and MAX_STACK_ALIGNMENT for any other variable.  */
6685
6686 bool
6687 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6688 {
6689   if (!VAR_P (decl))
6690     return false;
6691
6692   if (decl_in_symtab_p (decl)
6693       && !symtab_node::get (decl)->can_increase_alignment_p ())
6694     return false;
6695
6696   if (TREE_STATIC (decl))
6697     return (known_le (alignment,
6698                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6699   else
6700     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6701 }
6702
6703 /* Return whether the data reference DR_INFO is supported with respect to its
6704    alignment.
6705    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6706    if it is aligned, i.e., check if it is possible to vectorize it with
6707    different alignment.  */
6708
6709 enum dr_alignment_support
6710 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6711                                tree vectype, int misalignment)
6712 {
6713   data_reference *dr = dr_info->dr;
6714   stmt_vec_info stmt_info = dr_info->stmt;
6715   machine_mode mode = TYPE_MODE (vectype);
6716   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6717   class loop *vect_loop = NULL;
6718   bool nested_in_vect_loop = false;
6719
6720   if (misalignment == 0)
6721     return dr_aligned;
6722
6723   /* For now assume all conditional loads/stores support unaligned
6724      access without any special code.  */
6725   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6726     if (gimple_call_internal_p (stmt)
6727         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6728             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6729       return dr_unaligned_supported;
6730
6731   if (loop_vinfo)
6732     {
6733       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6734       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6735     }
6736
6737   /* Possibly unaligned access.  */
6738
6739   /* We can choose between using the implicit realignment scheme (generating
6740      a misaligned_move stmt) and the explicit realignment scheme (generating
6741      aligned loads with a REALIGN_LOAD).  There are two variants to the
6742      explicit realignment scheme: optimized, and unoptimized.
6743      We can optimize the realignment only if the step between consecutive
6744      vector loads is equal to the vector size.  Since the vector memory
6745      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6746      is guaranteed that the misalignment amount remains the same throughout the
6747      execution of the vectorized loop.  Therefore, we can create the
6748      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6749      at the loop preheader.
6750
6751      However, in the case of outer-loop vectorization, when vectorizing a
6752      memory access in the inner-loop nested within the LOOP that is now being
6753      vectorized, while it is guaranteed that the misalignment of the
6754      vectorized memory access will remain the same in different outer-loop
6755      iterations, it is *not* guaranteed that is will remain the same throughout
6756      the execution of the inner-loop.  This is because the inner-loop advances
6757      with the original scalar step (and not in steps of VS).  If the inner-loop
6758      step happens to be a multiple of VS, then the misalignment remains fixed
6759      and we can use the optimized realignment scheme.  For example:
6760
6761       for (i=0; i<N; i++)
6762         for (j=0; j<M; j++)
6763           s += a[i+j];
6764
6765      When vectorizing the i-loop in the above example, the step between
6766      consecutive vector loads is 1, and so the misalignment does not remain
6767      fixed across the execution of the inner-loop, and the realignment cannot
6768      be optimized (as illustrated in the following pseudo vectorized loop):
6769
6770       for (i=0; i<N; i+=4)
6771         for (j=0; j<M; j++){
6772           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6773                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6774                          // (assuming that we start from an aligned address).
6775           }
6776
6777      We therefore have to use the unoptimized realignment scheme:
6778
6779       for (i=0; i<N; i+=4)
6780           for (j=k; j<M; j+=4)
6781           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6782                            // that the misalignment of the initial address is
6783                            // 0).
6784
6785      The loop can then be vectorized as follows:
6786
6787       for (k=0; k<4; k++){
6788         rt = get_realignment_token (&vp[k]);
6789         for (i=0; i<N; i+=4){
6790           v1 = vp[i+k];
6791           for (j=k; j<M; j+=4){
6792             v2 = vp[i+j+VS-1];
6793             va = REALIGN_LOAD <v1,v2,rt>;
6794             vs += va;
6795             v1 = v2;
6796           }
6797         }
6798     } */
6799
6800   if (DR_IS_READ (dr))
6801     {
6802       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6803           && (!targetm.vectorize.builtin_mask_for_load
6804               || targetm.vectorize.builtin_mask_for_load ()))
6805         {
6806           /* If we are doing SLP then the accesses need not have the
6807              same alignment, instead it depends on the SLP group size.  */
6808           if (loop_vinfo
6809               && STMT_SLP_TYPE (stmt_info)
6810               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6811                               * (DR_GROUP_SIZE
6812                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6813                               TYPE_VECTOR_SUBPARTS (vectype)))
6814             ;
6815           else if (!loop_vinfo
6816                    || (nested_in_vect_loop
6817                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6818                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6819             return dr_explicit_realign;
6820           else
6821             return dr_explicit_realign_optimized;
6822         }
6823     }
6824
6825   bool is_packed = false;
6826   tree type = TREE_TYPE (DR_REF (dr));
6827   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6828     is_packed = not_size_aligned (DR_REF (dr));
6829   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6830                                                      is_packed))
6831     return dr_unaligned_supported;
6832
6833   /* Unsupported.  */
6834   return dr_unaligned_unsupported;
6835 }