gcc/tree-vect-data-refs.cc

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56 #include "gimple-fold.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100
 101 /* Return the smallest scalar part of STMT_INFO.
 102    This is used to determine the vectype of the stmt.  We generally set the
 103    vectype according to the type of the result (lhs).  For stmts whose
 104    result-type is different than the type of the arguments (e.g., demotion,
 105    promotion), vectype will be reset appropriately (later).  Note that we have
 106    to visit the smallest datatype in this function, because that determines the
 107    VF.  If the smallest datatype in the loop is present only as the rhs of a
 108    promotion operation - we'd miss it.
 109    Such a case, where a variable of this datatype does not appear in the lhs
 110    anywhere in the loop, can only occur if it's an invariant: e.g.:
 111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 112    invariant motion.  However, we cannot rely on invariant motion to always
 113    take invariants out of the loop, and so in the case of promotion we also
 114    have to check the rhs.
 115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 116    types.  */
 117
 118 tree
 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 120 {
 121   HOST_WIDE_INT lhs, rhs;
 122
 123   /* During the analysis phase, this function is called on arbitrary
 124      statements that might not have scalar results.  */
 125   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 126     return scalar_type;
 127
 128   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 129
 130   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 131   if (assign)
 132     {
 133       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
 134       if (gimple_assign_cast_p (assign)
 135           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
 140           || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
 141           || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
 142         {
 143           tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 144
 145           rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 146           if (rhs < lhs)
 147             scalar_type = rhs_type;
 148         }
 149     }
 150   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 151     {
 152       unsigned int i = 0;
 153       if (gimple_call_internal_p (call))
 154         {
 155           internal_fn ifn = gimple_call_internal_fn (call);
 156           if (internal_load_fn_p (ifn))
 157             /* For loads the LHS type does the trick.  */
 158             i = ~0U;
 159           else if (internal_store_fn_p (ifn))
 160             {
 161               /* For stores use the tyep of the stored value.  */
 162               i = internal_fn_stored_value_index (ifn);
 163               scalar_type = TREE_TYPE (gimple_call_arg (call, i));
 164               i = ~0U;
 165             }
 166           else if (internal_fn_mask_index (ifn) == 0)
 167             i = 1;
 168         }
 169       if (i < gimple_call_num_args (call))
 170         {
 171           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 172           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 173             {
 174               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 175               if (rhs < lhs)
 176                 scalar_type = rhs_type;
 177             }
 178         }
 179     }
 180
 181   return scalar_type;
 182 }
 183
 184
 185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 186    tested at run-time.  Return TRUE if DDR was successfully inserted.
 187    Return false if versioning is not supported.  */
 188
 189 static opt_result
 190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 191 {
 192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 193
 194   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 195     return opt_result::failure_at (vect_location,
 196                                    "will not create alias checks, as"
 197                                    " --param vect-max-version-for-alias-checks"
 198                                    " == 0\n");
 199
 200   opt_result res
 201     = runtime_alias_check_p (ddr, loop,
 202                              optimize_loop_nest_for_speed_p (loop));
 203   if (!res)
 204     return res;
 205
 206   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 207   return opt_result::success ();
 208 }
 209
 210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 211
 212 static void
 213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 214 {
 215   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 216   for (unsigned int i = 0; i < checks.length(); ++i)
 217     if (checks[i] == value)
 218       return;
 219
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location,
 222                      "need run-time check that %T is nonzero\n",
 223                      value);
 224   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 225 }
 226
 227 /* Return true if we know that the order of vectorized DR_INFO_A and
 228    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 229    DR_INFO_B.  At least one of the accesses is a write.  */
 230
 231 static bool
 232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 233 {
 234   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 235   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 236
 237   /* Single statements are always kept in their original order.  */
 238   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 239       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 240     return true;
 241
 242   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 243      emitted at the position of the first scalar load.
 244      Stores in a group are emitted at the position of the last scalar store.
 245      Compute that position and check whether the resulting order matches
 246      the current one.  */
 247   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 248   if (il_a)
 249     {
 250       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 251         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 252              s = DR_GROUP_NEXT_ELEMENT (s))
 253           il_a = get_later_stmt (il_a, s);
 254       else /* DR_IS_READ */
 255         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 256              s = DR_GROUP_NEXT_ELEMENT (s))
 257           if (get_later_stmt (il_a, s) == il_a)
 258             il_a = s;
 259     }
 260   else
 261     il_a = stmtinfo_a;
 262   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 263   if (il_b)
 264     {
 265       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 266         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 267              s = DR_GROUP_NEXT_ELEMENT (s))
 268           il_b = get_later_stmt (il_b, s);
 269       else /* DR_IS_READ */
 270         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 271              s = DR_GROUP_NEXT_ELEMENT (s))
 272           if (get_later_stmt (il_b, s) == il_b)
 273             il_b = s;
 274     }
 275   else
 276     il_b = stmtinfo_b;
 277   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 278   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 279 }
 280
 281 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 282    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 283    distances.  These distances are conservatively correct but they don't
 284    reflect a guaranteed dependence.
 285
 286    Return true if this function does all the work necessary to avoid
 287    an alias or false if the caller should use the dependence distances
 288    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 289    the depth of the loop described by LOOP_VINFO and the other arguments
 290    are as for vect_analyze_data_ref_dependence.  */
 291
 292 static bool
 293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 294                                        loop_vec_info loop_vinfo,
 295                                        int loop_depth, unsigned int *max_vf)
 296 {
 297   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 298   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
 299     {
 300       int dist = dist_v[loop_depth];
 301       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 302         {
 303           /* If the user asserted safelen >= DIST consecutive iterations
 304              can be executed concurrently, assume independence.
 305
 306              ??? An alternative would be to add the alias check even
 307              in this case, and vectorize the fallback loop with the
 308              maximum VF set to safelen.  However, if the user has
 309              explicitly given a length, it's less likely that that
 310              would be a win.  */
 311           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 312             {
 313               if ((unsigned int) loop->safelen < *max_vf)
 314                 *max_vf = loop->safelen;
 315               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 316               continue;
 317             }
 318
 319           /* For dependence distances of 2 or more, we have the option
 320              of limiting VF or checking for an alias at runtime.
 321              Prefer to check at runtime if we can, to avoid limiting
 322              the VF unnecessarily when the bases are in fact independent.
 323
 324              Note that the alias checks will be removed if the VF ends up
 325              being small enough.  */
 326           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 327           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 328           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 329                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 330                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 331         }
 332     }
 333   return true;
 334 }
 335
 336
/* Function vect_analyze_data_ref_dependence.

   Analyze the dependence relation DDR between its two data references
   DRA and DRB for the loop described by LOOP_VINFO.

   Note the sense of the result: return a failure if there (might) exist
   a dependence between memory-reference DRA and memory-reference DRB
   that prevents vectorization, and success otherwise.  When versioning
   for alias may check a dependence at run-time, return the result of
   vect_mark_for_runtime_alias_test.  Adjust *MAX_VF according to the
   data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* If user asserted safelen consecutive iterations can be
     executed concurrently, assume independence.  When that applies,
     the helper caps *MAX_VF at safelen, clears
     LOOP_VINFO_NO_DATA_DEPENDENCIES and returns true; otherwise it
     returns false without changing anything.  */
  auto apply_safelen = [&]()
    {
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return true;
        }
      return false;
    };

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  /* A reference paired with itself, or two reads, need no checking.  */
  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* For gather/scatter accesses only a user-provided safelen is
     accepted here; otherwise analysis fails.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
    {
      if (apply_safelen ())
        return opt_result::success ();

      return opt_result::failure_at
        (stmtinfo_a->stmt,
         "possible alias involving gather/scatter between %T and %T\n",
         DR_REF (dra), DR_REF (drb));
    }

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (apply_safelen ())
        return opt_result::success ();

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence, but without any distance vectors.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      if (apply_safelen ())
        return opt_result::success ();

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "bad dist vector for %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Index of the loop being vectorized within DDR's loop nest; used to
     pick the relevant entry of each distance vector.  */
  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  /* If the references could still be independent, let the helper decide
     whether everything necessary has already been done.  */
  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return opt_result::success ();

  /* Examine every distance vector; any one of them may fail the
     analysis or lower *MAX_VF.  */
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance  = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance == 0 between %T and %T\n",
                             DR_REF (dra), DR_REF (drb));

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");

          /* Without a usable safelen, the step of DRA must be known to
             be nonzero: fail if the indicator is missing or zero, and
             request a run-time check if it is not a constant.  */
          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  A stored
             value of zero is treated as "nothing recorded yet";
             otherwise the smallest distance seen is kept.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      /* The remaining distance is smaller than *MAX_VF and could not be
         accommodated by lowering it, so give up.  */
      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
 572
 573 /* Function vect_analyze_data_ref_dependences.
 574
 575    Examine all the data references in the loop, and make sure there do not
 576    exist any data dependences between them.  Set *MAX_VF according to
 577    the maximum vectorization factor the data dependences allow.  */
 578
 579 opt_result
 580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 581                                    unsigned int *max_vf)
 582 {
 583   unsigned int i;
 584   struct data_dependence_relation *ddr;
 585
 586   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 587
 588   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 589     {
 590       LOOP_VINFO_DDRS (loop_vinfo)
 591         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 592                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 593       /* We do not need read-read dependences.  */
 594       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 595                                           &LOOP_VINFO_DDRS (loop_vinfo),
 596                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 597                                           false);
 598       gcc_assert (res);
 599     }
 600
 601   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 602
 603   /* For epilogues we either have no aliases or alias versioning
 604      was applied to original loop.  Therefore we may just get max_vf
 605      using VF of original loop.  */
 606   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 607     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 608   else
 609     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 610       {
 611         opt_result res
 612           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 613         if (!res)
 614           return res;
 615       }
 616
 617   return opt_result::success ();
 618 }
 619
 620
 621 /* Function vect_slp_analyze_data_ref_dependence.
 622
 623    Return TRUE if there (might) exist a dependence between a memory-reference
 624    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 625    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 626    according to the data dependence.  */
 627
 628 static bool
 629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 630                                       struct data_dependence_relation *ddr)
 631 {
 632   struct data_reference *dra = DDR_A (ddr);
 633   struct data_reference *drb = DDR_B (ddr);
 634   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 635   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 636
 637   /* We need to check dependences of statements marked as unvectorizable
 638      as well, they still can prohibit vectorization.  */
 639
 640   /* Independent data accesses.  */
 641   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 642     return false;
 643
 644   if (dra == drb)
 645     return false;
 646
 647   /* Read-read is OK.  */
 648   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 649     return false;
 650
 651   /* If dra and drb are part of the same interleaving chain consider
 652      them independent.  */
 653   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 654       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 655           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 656     return false;
 657
 658   /* Unknown data dependence.  */
 659   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 660     {
 661       if  (dump_enabled_p ())
 662         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 663                          "can't determine dependence between %T and %T\n",
 664                          DR_REF (dra), DR_REF (drb));
 665     }
 666   else if (dump_enabled_p ())
 667     dump_printf_loc (MSG_NOTE, vect_location,
 668                      "determined dependence between %T and %T\n",
 669                      DR_REF (dra), DR_REF (drb));
 670
 671   return true;
 672 }
 673
 674
 675 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 676    contain the vector of scalar stores of this instance if we are
 677    disambiguating the loads.  */
 678
 679 static bool
 680 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
 681                                    vec<stmt_vec_info> stores,
 682                                    stmt_vec_info last_store_info)
 683 {
 684   /* This walks over all stmts involved in the SLP load/store done
 685      in NODE verifying we can sink them up to the last stmt in the
 686      group.  */
 687   if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
 688     {
 689       stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 690       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 691         {
 692           stmt_vec_info access_info
 693             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 694           if (access_info == last_access_info)
 695             continue;
 696           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 697           ao_ref ref;
 698           bool ref_initialized_p = false;
 699           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 700                gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 701             {
 702               gimple *stmt = gsi_stmt (gsi);
 703               if (! gimple_vuse (stmt))
 704                 continue;
 705
 706               /* If we couldn't record a (single) data reference for this
 707                  stmt we have to resort to the alias oracle.  */
 708               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 709               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 710               if (!dr_b)
 711                 {
 712                   /* We are moving a store - this means
 713                      we cannot use TBAA for disambiguation.  */
 714                   if (!ref_initialized_p)
 715                     ao_ref_init (&ref, DR_REF (dr_a));
 716                   if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 717                       || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 718                     return false;
 719                   continue;
 720                 }
 721
 722               bool dependent = false;
 723               /* If we run into a store of this same instance (we've just
 724                  marked those) then delay dependence checking until we run
 725                  into the last store because this is where it will have
 726                  been sunk to (and we verify if we can do that as well).  */
 727               if (gimple_visited_p (stmt))
 728                 {
 729                   if (stmt_info != last_store_info)
 730                     continue;
 731
 732                   for (stmt_vec_info &store_info : stores)
 733                     {
 734                       data_reference *store_dr
 735                         = STMT_VINFO_DATA_REF (store_info);
 736                       ddr_p ddr = initialize_data_dependence_relation
 737                                     (dr_a, store_dr, vNULL);
 738                       dependent
 739                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 740                       free_dependence_relation (ddr);
 741                       if (dependent)
 742                         break;
 743                     }
 744                 }
 745               else
 746                 {
 747                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 748                                                                    dr_b, vNULL);
 749                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 750                   free_dependence_relation (ddr);
 751                 }
 752               if (dependent)
 753                 return false;
 754             }
 755         }
 756     }
 757   else /* DR_IS_READ */
 758     {
 759       stmt_vec_info first_access_info
 760         = vect_find_first_scalar_stmt_in_slp (node);
 761       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 762         {
 763           stmt_vec_info access_info
 764             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 765           if (access_info == first_access_info)
 766             continue;
 767           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 768           ao_ref ref;
 769           bool ref_initialized_p = false;
 770           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 771                gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
 772             {
 773               gimple *stmt = gsi_stmt (gsi);
 774               if (! gimple_vdef (stmt))
 775                 continue;
 776
 777               /* If we couldn't record a (single) data reference for this
 778                  stmt we have to resort to the alias oracle.  */
 779               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 780               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 781
 782               /* We are hoisting a load - this means we can use
 783                  TBAA for disambiguation.  */
 784               if (!ref_initialized_p)
 785                 ao_ref_init (&ref, DR_REF (dr_a));
 786               if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
 787                 {
 788                   if (!dr_b)
 789                     return false;
 790                   /* Resort to dependence checking below.  */
 791                 }
 792               else
 793                 /* No dependence.  */
 794                 continue;
 795
 796               bool dependent = false;
 797               /* If we run into a store of this same instance (we've just
 798                  marked those) then delay dependence checking until we run
 799                  into the last store because this is where it will have
 800                  been sunk to (and we verify if we can do that as well).  */
 801               if (gimple_visited_p (stmt))
 802                 {
 803                   if (stmt_info != last_store_info)
 804                     continue;
 805
 806                   for (stmt_vec_info &store_info : stores)
 807                     {
 808                       data_reference *store_dr
 809                         = STMT_VINFO_DATA_REF (store_info);
 810                       ddr_p ddr = initialize_data_dependence_relation
 811                                     (dr_a, store_dr, vNULL);
 812                       dependent
 813                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 814                       free_dependence_relation (ddr);
 815                       if (dependent)
 816                         break;
 817                     }
 818                 }
 819               else
 820                 {
 821                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 822                                                                    dr_b, vNULL);
 823                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 824                   free_dependence_relation (ddr);
 825                 }
 826               if (dependent)
 827                 return false;
 828             }
 829         }
 830     }
 831   return true;
 832 }
 833
 834
 835 /* Function vect_analyze_data_ref_dependences.
 836
 837    Examine all the data references in the basic-block, and make sure there
 838    do not exist any data dependences between them.  Set *MAX_VF according to
 839    the maximum vectorization factor the data dependences allow.  */
 840
 841 bool
 842 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
 843 {
 844   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 845
 846   /* The stores of this instance are at the root of the SLP tree.  */
 847   slp_tree store = NULL;
 848   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
 849     store = SLP_INSTANCE_TREE (instance);
 850
 851   /* Verify we can sink stores to the vectorized stmt insert location.  */
 852   stmt_vec_info last_store_info = NULL;
 853   if (store)
 854     {
 855       if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
 856         return false;
 857
 858       /* Mark stores in this instance and remember the last one.  */
 859       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 860       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 861         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 862     }
 863
 864   bool res = true;
 865
 866   /* Verify we can sink loads to the vectorized stmt insert location,
 867      special-casing stores of this instance.  */
 868   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
 869     if (! vect_slp_analyze_node_dependences (vinfo, load,
 870                                              store
 871                                              ? SLP_TREE_SCALAR_STMTS (store)
 872                                              : vNULL, last_store_info))
 873       {
 874         res = false;
 875         break;
 876       }
 877
 878   /* Unset the visited flag.  */
 879   if (store)
 880     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 881       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 882
 883   return res;
 884 }
 885
 886 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
 887    applied.  */
 888
 889 int
 890 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
 891 {
 892   HOST_WIDE_INT diff = 0;
 893   /* Alignment is only analyzed for the first element of a DR group,
 894      use that but adjust misalignment by the offset of the access.  */
 895   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
 896     {
 897       dr_vec_info *first_dr
 898         = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
 899       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
 900          INTEGER_CSTs and the first element in the group has the lowest
 901          address.  */
 902       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
 903               - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
 904       gcc_assert (diff >= 0);
 905       dr_info = first_dr;
 906     }
 907
 908   int misalign = dr_info->misalignment;
 909   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
 910   if (misalign == DR_MISALIGNMENT_UNKNOWN)
 911     return misalign;
 912
 913   /* If the access is only aligned for a vector type with smaller alignment
 914      requirement the access has unknown misalignment.  */
 915   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
 916                 targetm.vectorize.preferred_vector_alignment (vectype)))
 917     return DR_MISALIGNMENT_UNKNOWN;
 918
 919   /* Apply the offset from the DR group start and the externally supplied
 920      offset which can for example result from a negative stride access.  */
 921   poly_int64 misalignment = misalign + diff + offset;
 922
 923   /* vect_compute_data_ref_alignment will have ensured that target_alignment
 924      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
 925   unsigned HOST_WIDE_INT target_alignment_c
 926     = dr_info->target_alignment.to_constant ();
 927   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
 928     return DR_MISALIGNMENT_UNKNOWN;
 929   return misalign;
 930 }
 931
 932 /* Record the base alignment guarantee given by DRB, which occurs
 933    in STMT_INFO.  */
 934
 935 static void
 936 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
 937                             innermost_loop_behavior *drb)
 938 {
 939   bool existed;
 940   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
 941     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 942   if (!existed || entry.second->base_alignment < drb->base_alignment)
 943     {
 944       entry = std::make_pair (stmt_info, drb);
 945       if (dump_enabled_p ())
 946         dump_printf_loc (MSG_NOTE, vect_location,
 947                          "recording new base alignment for %T\n"
 948                          "  alignment:    %d\n"
 949                          "  misalignment: %d\n"
 950                          "  based on:     %G",
 951                          drb->base_address,
 952                          drb->base_alignment,
 953                          drb->base_misalignment,
 954                          stmt_info->stmt);
 955     }
 956 }
 957
 958 /* If the region we're going to vectorize is reached, all unconditional
 959    data references occur at least once.  We can therefore pool the base
 960    alignment guarantees from each unconditional reference.  Do this by
 961    going through all the data references in VINFO and checking whether
 962    the containing statement makes the reference unconditionally.  If so,
 963    record the alignment of the base address in VINFO so that it can be
 964    used for all other references with the same base.  */
 965
 966 void
 967 vect_record_base_alignments (vec_info *vinfo)
 968 {
 969   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 970   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 971   for (data_reference *dr : vinfo->shared->datarefs)
 972     {
 973       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 974       stmt_vec_info stmt_info = dr_info->stmt;
 975       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 976           && STMT_VINFO_VECTORIZABLE (stmt_info)
 977           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 978         {
 979           vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
 980
 981           /* If DR is nested in the loop that is being vectorized, we can also
 982              record the alignment of the base wrt the outer loop.  */
 983           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 984             vect_record_base_alignment
 985               (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 986         }
 987     }
 988 }
 989
 990 /* Function vect_compute_data_ref_alignment
 991
 992    Compute the misalignment of the data reference DR_INFO when vectorizing
 993    with VECTYPE.
 994
 995    Output:
 996    1. initialized misalignment info for DR_INFO
 997
 998    FOR NOW: No analysis is actually performed. Misalignment is calculated
 999    only for trivial cases. TODO.  */
1000
1001 static void
1002 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1003                                  tree vectype)
1004 {
1005   stmt_vec_info stmt_info = dr_info->stmt;
1006   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1007   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1008   class loop *loop = NULL;
1009   tree ref = DR_REF (dr_info->dr);
1010
1011   if (dump_enabled_p ())
1012     dump_printf_loc (MSG_NOTE, vect_location,
1013                      "vect_compute_data_ref_alignment:\n");
1014
1015   if (loop_vinfo)
1016     loop = LOOP_VINFO_LOOP (loop_vinfo);
1017
1018   /* Initialize misalignment to unknown.  */
1019   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1020
1021   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1022     return;
1023
1024   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1025   bool step_preserves_misalignment_p;
1026
1027   poly_uint64 vector_alignment
1028     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1029                  BITS_PER_UNIT);
1030   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1031
1032   /* If the main loop has peeled for alignment we have no way of knowing
1033      whether the data accesses in the epilogues are aligned.  We can't at
1034      compile time answer the question whether we have entered the main loop or
1035      not.  Fixes PR 92351.  */
1036   if (loop_vinfo)
1037     {
1038       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1039       if (orig_loop_vinfo
1040           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1041         return;
1042     }
1043
1044   unsigned HOST_WIDE_INT vect_align_c;
1045   if (!vector_alignment.is_constant (&vect_align_c))
1046     return;
1047
1048   /* No step for BB vectorization.  */
1049   if (!loop)
1050     {
1051       gcc_assert (integer_zerop (drb->step));
1052       step_preserves_misalignment_p = true;
1053     }
1054
1055   /* In case the dataref is in an inner-loop of the loop that is being
1056      vectorized (LOOP), we use the base and misalignment information
1057      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1058      stays the same throughout the execution of the inner-loop, which is why
1059      we have to check that the stride of the dataref in the inner-loop evenly
1060      divides by the vector alignment.  */
1061   else if (nested_in_vect_loop_p (loop, stmt_info))
1062     {
1063       step_preserves_misalignment_p
1064         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1065
1066       if (dump_enabled_p ())
1067         {
1068           if (step_preserves_misalignment_p)
1069             dump_printf_loc (MSG_NOTE, vect_location,
1070                              "inner step divides the vector alignment.\n");
1071           else
1072             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1073                              "inner step doesn't divide the vector"
1074                              " alignment.\n");
1075         }
1076     }
1077
1078   /* Similarly we can only use base and misalignment information relative to
1079      an innermost loop if the misalignment stays the same throughout the
1080      execution of the loop.  As above, this is the case if the stride of
1081      the dataref evenly divides by the alignment.  */
1082   else
1083     {
1084       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1085       step_preserves_misalignment_p
1086         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1087
1088       if (!step_preserves_misalignment_p && dump_enabled_p ())
1089         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090                          "step doesn't divide the vector alignment.\n");
1091     }
1092
1093   unsigned int base_alignment = drb->base_alignment;
1094   unsigned int base_misalignment = drb->base_misalignment;
1095
1096   /* Calculate the maximum of the pooled base address alignment and the
1097      alignment that we can compute for DR itself.  */
1098   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1099     = base_alignments->get (drb->base_address);
1100   if (entry
1101       && base_alignment < (*entry).second->base_alignment
1102       && (loop_vinfo
1103           || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1104                               gimple_bb (entry->first->stmt))
1105               && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1106                   || (entry->first->dr_aux.group <= dr_info->group)))))
1107     {
1108       base_alignment = entry->second->base_alignment;
1109       base_misalignment = entry->second->base_misalignment;
1110     }
1111
1112   if (drb->offset_alignment < vect_align_c
1113       || !step_preserves_misalignment_p
1114       /* We need to know whether the step wrt the vectorized loop is
1115          negative when computing the starting misalignment below.  */
1116       || TREE_CODE (drb->step) != INTEGER_CST)
1117     {
1118       if (dump_enabled_p ())
1119         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120                          "Unknown alignment for access: %T\n", ref);
1121       return;
1122     }
1123
1124   if (base_alignment < vect_align_c)
1125     {
1126       unsigned int max_alignment;
1127       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1128       if (max_alignment < vect_align_c
1129           || !vect_can_force_dr_alignment_p (base,
1130                                              vect_align_c * BITS_PER_UNIT))
1131         {
1132           if (dump_enabled_p ())
1133             dump_printf_loc (MSG_NOTE, vect_location,
1134                              "can't force alignment of ref: %T\n", ref);
1135           return;
1136         }
1137
1138       /* Force the alignment of the decl.
1139          NOTE: This is the only change to the code we make during
1140          the analysis phase, before deciding to vectorize the loop.  */
1141       if (dump_enabled_p ())
1142         dump_printf_loc (MSG_NOTE, vect_location,
1143                          "force alignment of %T\n", ref);
1144
1145       dr_info->base_decl = base;
1146       dr_info->base_misaligned = true;
1147       base_misalignment = 0;
1148     }
1149   poly_int64 misalignment
1150     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1151
1152   unsigned int const_misalignment;
1153   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1154     {
1155       if (dump_enabled_p ())
1156         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157                          "Non-constant misalignment for access: %T\n", ref);
1158       return;
1159     }
1160
1161   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1162
1163   if (dump_enabled_p ())
1164     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165                      "misalign = %d bytes of ref %T\n",
1166                      const_misalignment, ref);
1167
1168   return;
1169 }
1170
1171 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1172    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1173    is made aligned via peeling.  */
1174
1175 static bool
1176 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1177                                          dr_vec_info *dr_peel_info)
1178 {
1179   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1180                   DR_TARGET_ALIGNMENT (dr_info)))
1181     {
1182       poly_offset_int diff
1183         = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1184            - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1185       if (known_eq (diff, 0)
1186           || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1187         return true;
1188     }
1189   return false;
1190 }
1191
1192 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1193    aligned via peeling.  */
1194
1195 static bool
1196 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1197                                  dr_vec_info *dr_peel_info)
1198 {
1199   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1200                         DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1201       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1202                            DR_OFFSET (dr_peel_info->dr), 0)
1203       || !operand_equal_p (DR_STEP (dr_info->dr),
1204                            DR_STEP (dr_peel_info->dr), 0))
1205     return false;
1206
1207   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 }
1209
1210 /* Compute the value for dr_info->misalign so that the access appears
1211    aligned.  This is used by peeling to compensate for dr_misalignment
1212    applying the offset for negative step.  */
1213
1214 int
1215 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1216 {
1217   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1218     return 0;
1219
1220   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1221   poly_int64 misalignment
1222     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1223        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1224
1225   unsigned HOST_WIDE_INT target_alignment_c;
1226   int misalign;
1227   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1228       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1229     return DR_MISALIGNMENT_UNKNOWN;
1230   return misalign;
1231 }
1232
1233 /* Function vect_update_misalignment_for_peel.
1234    Sets DR_INFO's misalignment
1235    - to 0 if it has the same alignment as DR_PEEL_INFO,
1236    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1237    - to -1 (unknown) otherwise.
1238
1239    DR_INFO - the data reference whose misalignment is to be adjusted.
1240    DR_PEEL_INFO - the data reference whose misalignment is being made
1241                   zero in the vector loop by the peel.
1242    NPEEL - the number of iterations in the peel loop if the misalignment
1243            of DR_PEEL_INFO is known at compile time.  */
1244
1245 static void
1246 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1247                                    dr_vec_info *dr_peel_info, int npeel)
1248 {
1249   /* If dr_info is aligned of dr_peel_info is, then mark it so.  */
1250   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1251     {
1252       SET_DR_MISALIGNMENT (dr_info,
1253                            vect_dr_misalign_for_aligned_access (dr_peel_info));
1254       return;
1255     }
1256
1257   unsigned HOST_WIDE_INT alignment;
1258   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1259       && known_alignment_for_access_p (dr_info,
1260                                        STMT_VINFO_VECTYPE (dr_info->stmt))
1261       && known_alignment_for_access_p (dr_peel_info,
1262                                        STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1263     {
1264       int misal = dr_info->misalignment;
1265       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1266       misal &= alignment - 1;
1267       set_dr_misalignment (dr_info, misal);
1268       return;
1269     }
1270
1271   if (dump_enabled_p ())
1272     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1273                      "to unknown (-1).\n");
1274   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1275 }
1276
1277 /* Return true if alignment is relevant for DR_INFO.  */
1278
1279 static bool
1280 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1281 {
1282   stmt_vec_info stmt_info = dr_info->stmt;
1283
1284   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1285     return false;
1286
1287   /* For interleaving, only the alignment of the first access matters.  */
1288   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1289       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1290     return false;
1291
1292   /* Scatter-gather and invariant accesses continue to address individual
1293      scalars, so vector-level alignment is irrelevant.  */
1294   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1295       || integer_zerop (DR_STEP (dr_info->dr)))
1296     return false;
1297
1298   /* Strided accesses perform only component accesses, alignment is
1299      irrelevant for them.  */
1300   if (STMT_VINFO_STRIDED_P (stmt_info)
1301       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1302     return false;
1303
1304   return true;
1305 }
1306
1307 /* Given an memory reference EXP return whether its alignment is less
1308    than its size.  */
1309
1310 static bool
1311 not_size_aligned (tree exp)
1312 {
1313   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1314     return true;
1315
1316   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1317           > get_object_alignment (exp));
1318 }
1319
1320 /* Function vector_alignment_reachable_p
1321
1322    Return true if vector alignment for DR_INFO is reachable by peeling
1323    a few loop iterations.  Return false otherwise.  */
1324
1325 static bool
1326 vector_alignment_reachable_p (dr_vec_info *dr_info)
1327 {
1328   stmt_vec_info stmt_info = dr_info->stmt;
1329   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1330
1331   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1332     {
1333       /* For interleaved access we peel only if number of iterations in
1334          the prolog loop ({VF - misalignment}), is a multiple of the
1335          number of the interleaved accesses.  */
1336       int elem_size, mis_in_elements;
1337
1338       /* FORNOW: handle only known alignment.  */
1339       if (!known_alignment_for_access_p (dr_info, vectype))
1340         return false;
1341
1342       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1343       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1344       elem_size = vector_element_size (vector_size, nelements);
1345       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1346
1347       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1348         return false;
1349     }
1350
1351   /* If misalignment is known at the compile time then allow peeling
1352      only if natural alignment is reachable through peeling.  */
1353   if (known_alignment_for_access_p (dr_info, vectype)
1354       && !aligned_access_p (dr_info, vectype))
1355     {
1356       HOST_WIDE_INT elmsize =
1357                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1358       if (dump_enabled_p ())
1359         {
1360           dump_printf_loc (MSG_NOTE, vect_location,
1361                            "data size = %wd. misalignment = %d.\n", elmsize,
1362                            dr_misalignment (dr_info, vectype));
1363         }
1364       if (dr_misalignment (dr_info, vectype) % elmsize)
1365         {
1366           if (dump_enabled_p ())
1367             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368                              "data size does not divide the misalignment.\n");
1369           return false;
1370         }
1371     }
1372
1373   if (!known_alignment_for_access_p (dr_info, vectype))
1374     {
1375       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1376       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1377       if (dump_enabled_p ())
1378         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379                          "Unknown misalignment, %snaturally aligned\n",
1380                          is_packed ? "not " : "");
1381       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382     }
1383
1384   return true;
1385 }
1386
1387
1388 /* Calculate the cost of the memory access represented by DR_INFO.  */
1389
1390 static void
1391 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1392                            dr_alignment_support alignment_support_scheme,
1393                            int misalignment,
1394                            unsigned int *inside_cost,
1395                            unsigned int *outside_cost,
1396                            stmt_vector_for_cost *body_cost_vec,
1397                            stmt_vector_for_cost *prologue_cost_vec)
1398 {
1399   stmt_vec_info stmt_info = dr_info->stmt;
1400   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1401   int ncopies;
1402
1403   if (PURE_SLP_STMT (stmt_info))
1404     ncopies = 1;
1405   else
1406     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1407
1408   if (DR_IS_READ (dr_info->dr))
1409     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1410                         misalignment, true, inside_cost,
1411                         outside_cost, prologue_cost_vec, body_cost_vec, false);
1412   else
1413     vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1414                          misalignment, inside_cost, body_cost_vec);
1415
1416   if (dump_enabled_p ())
1417     dump_printf_loc (MSG_NOTE, vect_location,
1418                      "vect_get_data_access_cost: inside_cost = %d, "
1419                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1420 }
1421
1422
1423 typedef struct _vect_peel_info
1424 {
1425   dr_vec_info *dr_info;
1426   int npeel;
1427   unsigned int count;
1428 } *vect_peel_info;
1429
1430 typedef struct _vect_peel_extended_info
1431 {
1432   vec_info *vinfo;
1433   struct _vect_peel_info peel_info;
1434   unsigned int inside_cost;
1435   unsigned int outside_cost;
1436 } *vect_peel_extended_info;
1437
1438
1439 /* Peeling hashtable helpers.  */
1440
1441 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1442 {
1443   static inline hashval_t hash (const _vect_peel_info *);
1444   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 };
1446
1447 inline hashval_t
1448 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1449 {
1450   return (hashval_t) peel_info->npeel;
1451 }
1452
1453 inline bool
1454 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1455 {
1456   return (a->npeel == b->npeel);
1457 }
1458
1459
1460 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1461
1462 static void
1463 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1464                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1465                           int npeel, bool supportable_if_not_aligned)
1466 {
1467   struct _vect_peel_info elem, *slot;
1468   _vect_peel_info **new_slot;
1469
1470   elem.npeel = npeel;
1471   slot = peeling_htab->find (&elem);
1472   if (slot)
1473     slot->count++;
1474   else
1475     {
1476       slot = XNEW (struct _vect_peel_info);
1477       slot->npeel = npeel;
1478       slot->dr_info = dr_info;
1479       slot->count = 1;
1480       new_slot = peeling_htab->find_slot (slot, INSERT);
1481       *new_slot = slot;
1482     }
1483
1484   /* If this DR is not supported with unknown misalignment then bias
1485      this slot when the cost model is disabled.  */
1486   if (!supportable_if_not_aligned
1487       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1488     slot->count += VECT_MAX_COST;
1489 }
1490
1491
1492 /* Traverse peeling hash table to find peeling option that aligns maximum
1493    number of data accesses.  */
1494
1495 int
1496 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1497                                      _vect_peel_extended_info *max)
1498 {
1499   vect_peel_info elem = *slot;
1500
1501   if (elem->count > max->peel_info.count
1502       || (elem->count == max->peel_info.count
1503           && max->peel_info.npeel > elem->npeel))
1504     {
1505       max->peel_info.npeel = elem->npeel;
1506       max->peel_info.count = elem->count;
1507       max->peel_info.dr_info = elem->dr_info;
1508     }
1509
1510   return 1;
1511 }
1512
1513 /* Run vect_get_data_access_cost (with INSIDE_COST, OUTSIDE_COST,
1514    BODY_COST_VEC and PROLOGUE_COST_VEC) on every alignment-relevant data ref
1515    of LOOP_VINFO, using the misalignment it has once NPEEL iterations are
1516    peeled for DR0_INFO.  DR0_INFO may be NULL, as when NPEEL is zero; for
1517    nonzero NPEEL it counts as aligned even if its misalignment is unknown.  */
1518
1519 static void
1520 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1521                                 dr_vec_info *dr0_info,
1522                                 unsigned int *inside_cost,
1523                                 unsigned int *outside_cost,
1524                                 stmt_vector_for_cost *body_cost_vec,
1525                                 stmt_vector_for_cost *prologue_cost_vec,
1526                                 unsigned int npeel)
1527 {
1528   vec<data_reference_p> all_refs = LOOP_VINFO_DATAREFS (loop_vinfo);
1529
1530   /* Is the misalignment of the ref we peel for known at compile time?  */
1531   bool peeled_known_p = false;
1532   if (dr0_info)
1533     peeled_known_p
1534       = known_alignment_for_access_p (dr0_info,
1535                                       STMT_VINFO_VECTYPE (dr0_info->stmt));
1536
1537   for (data_reference *ref : all_refs)
1538     {
1539       dr_vec_info *info = loop_vinfo->lookup_dr (ref);
1540       if (!vect_relevant_for_alignment_p (info))
1541         continue;
1542
1543       tree vectype = STMT_VINFO_VECTYPE (info->stmt);
1544       /* A negative step needs an offset of -(subparts - 1) elements.  */
1545       poly_int64 off = 0;
1546       if (tree_int_cst_compare (DR_STEP (info->dr), size_zero_node) < 0)
1547         off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1548                * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1549
1550       int misalignment;
1551       unsigned HOST_WIDE_INT target_align;
1552       if (npeel == 0)
1553         misalignment = dr_misalignment (info, vectype, off);
1554       else if (info == dr0_info
1555                || vect_dr_aligned_if_peeled_dr_is (info, dr0_info))
1556         misalignment = 0;
1557       else if (peeled_known_p
1558                && known_alignment_for_access_p (info, vectype)
1559                && DR_TARGET_ALIGNMENT (info).is_constant (&target_align))
1560         {
1561           /* Advance by NPEEL steps, modulo the target alignment.  */
1562           misalignment = dr_misalignment (info, vectype, off);
1563           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (info->dr));
1564           misalignment &= target_align - 1;
1565         }
1566       else
1567         misalignment = DR_MISALIGNMENT_UNKNOWN;
1568
1569       dr_alignment_support scheme
1570         = vect_supportable_dr_alignment (loop_vinfo, info, vectype,
1571                                          misalignment);
1572       vect_get_data_access_cost (loop_vinfo, info, scheme, misalignment,
1573                                  inside_cost, outside_cost,
1574                                  body_cost_vec, prologue_cost_vec);
1575     }
1576 }
1577
1578 /* Hash table traversal callback.  Compute the cost of the peeling option in
1579    *SLOT and record it in MIN when it is cheaper than what MIN holds.  */
1580
1581 int
1582 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1583                                    _vect_peel_extended_info *min)
1584 {
1585   vect_peel_info cand = *slot;
1586   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1587   unsigned int cost_inside = 0, cost_outside = 0;
1588   int ignored;
1589   stmt_vector_for_cost prologue_costs, body_costs, epilogue_costs;
1590   prologue_costs.create (2);
1591   body_costs.create (2);
1592   epilogue_costs.create (2);
1593
1594   /* Data access costs of all data refs when peeling CAND->npeel
1595      iterations for CAND->dr_info.  */
1596   vect_get_peeling_costs_all_drs (loop_vinfo, cand->dr_info, &cost_inside,
1597                                   &cost_outside, &body_costs,
1598                                   &prologue_costs, cand->npeel);
1599   body_costs.release ();
1600
1601   cost_outside += vect_get_known_peeling_cost
1602     (loop_vinfo, cand->npeel, &ignored,
1603      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1604      &prologue_costs, &epilogue_costs);
1605
1606   /* The prologue and epilogue costs get added to the target model later;
1607      they depend only on the scalar iteration cost, the finally chosen
1608      number of peeled iterations and the number of misaligned statements.
1609      Hence the vectors filled in here are simply thrown away.  */
1610   prologue_costs.release ();
1611   epilogue_costs.release ();
1612
1613   /* Inside cost decides; outside cost only breaks ties.  */
1614   bool cheaper = (cost_inside < min->inside_cost
1615                   || (cost_inside == min->inside_cost
1616                       && cost_outside < min->outside_cost));
1617   if (cheaper)
1618     {
1619       min->peel_info.count = cand->count;
1620       min->peel_info.npeel = cand->npeel;
1621       min->peel_info.dr_info = cand->dr_info;
1622       min->outside_cost = cost_outside;
1623       min->inside_cost = cost_inside;
1624     }
1625   return 1;
1626 }
1627
1628
1629 /* Walk PEELING_HTAB and return the preferred peeling option for LOOP_VINFO.
1630    With an unlimited cost model this is the option aligning the most
1631    accesses (reported with zero costs); otherwise the cheapest option.  */
1632
1633 static struct _vect_peel_extended_info
1634 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1635                                        loop_vec_info loop_vinfo)
1636 {
1637   struct _vect_peel_extended_info best;
1638   best.vinfo = loop_vinfo;
1639   best.peel_info.dr_info = NULL;
1640
1641   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1642     {
1643       /* No cost model: pick the entry with the highest count.  */
1644       best.peel_info.count = 0;
1645       peeling_htab->traverse <_vect_peel_extended_info *,
1646                               vect_peeling_hash_get_most_frequent> (&best);
1647       best.inside_cost = 0;
1648       best.outside_cost = 0;
1649     }
1650   else
1651     {
1652       /* Seed with INT_MAX costs before searching for the minimum.  */
1653       best.inside_cost = INT_MAX;
1654       best.outside_cost = INT_MAX;
1655       peeling_htab->traverse <_vect_peel_extended_info *,
1656                               vect_peeling_hash_get_lowest_cost> (&best);
1657     }
1658   return best;
1659 }
1660
1661 /* Return true if peeling NPEEL iterations for DR0_INFO leaves no
1662    alignment-relevant data ref of LOOP_VINFO with unsupported alignment.  */
1663
1664 static bool
1665 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1666                           unsigned npeel)
1667 {
1668   vec<data_reference_p> all_refs = LOOP_VINFO_DATAREFS (loop_vinfo);
1669   bool peeled_known_p
1670     = known_alignment_for_access_p (dr0_info,
1671                                     STMT_VINFO_VECTYPE (dr0_info->stmt));
1672
1673   for (data_reference *ref : all_refs)
1674     {
1675       /* The ref being peeled for needs no check.  */
1676       if (ref == dr0_info->dr)
1677         continue;
1678
1679       /* Neither do irrelevant refs nor refs aligned together with it.  */
1680       dr_vec_info *info = loop_vinfo->lookup_dr (ref);
1681       if (!vect_relevant_for_alignment_p (info)
1682           || vect_dr_aligned_if_peeled_dr_is (info, dr0_info))
1683         continue;
1684
1685       tree vectype = STMT_VINFO_VECTYPE (info->stmt);
1686       unsigned HOST_WIDE_INT target_align;
1687       int misalignment = DR_MISALIGNMENT_UNKNOWN;
1688       if (peeled_known_p
1689           && known_alignment_for_access_p (info, vectype)
1690           && DR_TARGET_ALIGNMENT (info).is_constant (&target_align))
1691         {
1692           /* Advance the known misalignment by NPEEL steps, modulo the
1693              target alignment.  */
1694           misalignment = dr_misalignment (info, vectype);
1695           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (info->dr));
1696           misalignment &= target_align - 1;
1697         }
1698
1699       if (vect_supportable_dr_alignment (loop_vinfo, info, vectype,
1700                                          misalignment)
1701           == dr_unaligned_unsupported)
1702         return false;
1703     }
1704
1705   return true;
1706 }
1707
1708 /* qsort comparator ordering data references DRA_ and DRB_ so that
1709    references with related alignment end up next to each other.  */
1710
1711 static int
1712 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1713 {
1714   data_reference_p lhs = *(data_reference_p *)const_cast<void *>(dra_);
1715   data_reference_p rhs = *(data_reference_p *)const_cast<void *>(drb_);
1716
1717   /* An element always compares equal to itself.  */
1718   if (lhs == rhs)
1719     return 0;
1720
1721   /* Primary key: the base address.  */
1722   if (int by_base = data_ref_compare_tree (DR_BASE_ADDRESS (lhs),
1723                                            DR_BASE_ADDRESS (rhs)))
1724     return by_base;
1725
1726   /* Secondary key: DR_OFFSET.  */
1727   if (int by_offset = data_ref_compare_tree (DR_OFFSET (lhs), DR_OFFSET (rhs)))
1728     return by_offset;
1729
1730   /* Tertiary key: DR_STEP.  */
1731   if (int by_step = data_ref_compare_tree (DR_STEP (lhs), DR_STEP (rhs)))
1732     return by_step;
1733
1734   /* Last key: DR_INIT.  */
1735   if (int by_init = data_ref_compare_tree (DR_INIT (lhs), DR_INIT (rhs)))
1736     return by_init;
1737
1738   /* References identical in all keys are ordered by statement UID to
1739      break the tie.  */
1740   if (gimple_uid (DR_STMT (lhs)) < gimple_uid (DR_STMT (rhs)))
1741     return -1;
1742   return 1;
1743 }
1744
1745 /* Function vect_enhance_data_refs_alignment
1746
1747    This pass will use loop versioning and loop peeling in order to enhance
1748    the alignment of data references in the loop.
1749
1750    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1751    original loop is to be vectorized.  Any other loops that are created by
1752    the transformations performed in this pass - are not supposed to be
1753    vectorized.  This restriction will be relaxed.
1754
1755    This pass will require a cost model to guide it whether to apply peeling
1756    or versioning or a combination of the two.  For example, the scheme that
1757    Intel uses when given a loop with several memory accesses is as follows:
1758    choose one memory access ('p') whose alignment you want to force by doing
1759    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1760    other accesses are not necessarily aligned, or (2) use loop versioning to
1761    generate one loop in which all accesses are aligned, and another loop in
1762    which only 'p' is necessarily aligned.
1763
1764    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1765    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1766    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1767
1768    Devising a cost model is the most critical aspect of this work.  It will
1769    guide us on which access to peel for, whether to use loop versioning, how
1770    many versions to create, etc.  The cost model will probably consist of
1771    generic considerations as well as target specific considerations (on
1772    powerpc for example, misaligned stores are more painful than misaligned
1773    loads).
1774
1775    Here are the general steps involved in alignment enhancements:
1776
1777      -- original loop, before alignment analysis:
1778         for (i=0; i<N; i++){
1779           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1780           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1781         }
1782
1783      -- After vect_compute_data_refs_alignment:
1784         for (i=0; i<N; i++){
1785           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1786           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1787         }
1788
1789      -- Possibility 1: we do loop versioning:
1790      if (p is aligned) {
1791         for (i=0; i<N; i++){    # loop 1A
1792           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1793           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1794         }
1795      }
1796      else {
1797         for (i=0; i<N; i++){    # loop 1B
1798           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1799           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1800         }
1801      }
1802
1803      -- Possibility 2: we do loop peeling:
1804      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1805         x = q[i];
1806         p[i] = y;
1807      }
1808      for (i = 3; i < N; i++){   # loop 2A
1809         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1810         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1811      }
1812
1813      -- Possibility 3: combination of loop peeling and versioning:
1814      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1815         x = q[i];
1816         p[i] = y;
1817      }
1818      if (p is aligned) {
1819         for (i = 3; i<N; i++){  # loop 3A
1820           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1821           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1822         }
1823      }
1824      else {
1825         for (i = 3; i<N; i++){  # loop 3B
1826           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1827           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1828         }
1829      }
1830
1831      These loops are later passed to loop_transform to be vectorized.  The
1832      vectorizer will use the alignment information to guide the transformation
1833      (whether to generate regular loads/stores, or with special handling for
1834      misalignment).  */
1835
1836 opt_result
1837 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1838 {
1839   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1840   dr_vec_info *first_store = NULL;
1841   dr_vec_info *dr0_info = NULL;
1842   struct data_reference *dr;
1843   unsigned int i;
1844   bool do_peeling = false;
1845   bool do_versioning = false;
1846   unsigned int npeel = 0;
1847   bool one_misalignment_known = false;
1848   bool one_misalignment_unknown = false;
1849   bool one_dr_unsupportable = false;
1850   dr_vec_info *unsupportable_dr_info = NULL;
1851   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1852   hash_table<peel_info_hasher> peeling_htab (1);
1853
1854   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1855
1856   /* Reset data so we can safely be called multiple times.  */
1857   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1858   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1859
1860   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1861     return opt_result::success ();
1862
1863   /* Sort the vector of datarefs so DRs that have the same or dependent
1864      alignment are next to each other.  */
1865   auto_vec<data_reference_p> datarefs
1866     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1867   datarefs.qsort (dr_align_group_sort_cmp);
1868
1869   /* Compute the number of DRs that become aligned when we peel
1870      a dataref so it becomes aligned.  */
1871   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1872   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1873   unsigned i0;
1874   for (i0 = 0; i0 < datarefs.length (); ++i0)
1875     if (DR_BASE_ADDRESS (datarefs[i0]))
1876       break;
1877   for (i = i0 + 1; i <= datarefs.length (); ++i)
1878     {
1879       if (i == datarefs.length ()
1880           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1881                                DR_BASE_ADDRESS (datarefs[i]), 0)
1882           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1883                                DR_OFFSET (datarefs[i]), 0)
1884           || !operand_equal_p (DR_STEP (datarefs[i0]),
1885                                DR_STEP (datarefs[i]), 0))
1886         {
1887           /* The subgroup [i0, i-1] now only differs in DR_INIT and
1888              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1889              will get known misalignment if we align one of the refs
1890              with the largest DR_TARGET_ALIGNMENT.  */
1891           for (unsigned j = i0; j < i; ++j)
1892             {
1893               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1894               for (unsigned k = i0; k < i; ++k)
1895                 {
1896                   if (k == j)
1897                     continue;
1898                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1899                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1900                                                                dr_infoj))
1901                     n_same_align_refs[j]++;
1902                 }
1903             }
1904           i0 = i;
1905         }
1906     }
1907
1908   /* While cost model enhancements are expected in the future, the high level
1909      view of the code at this time is as follows:
1910
1911      A) If there is a misaligned access then see if peeling to align
1912         this access can make all data references satisfy
1913         vect_supportable_dr_alignment.  If so, update data structures
1914         as needed and return true.
1915
1916      B) If peeling wasn't possible and there is a data reference with an
1917         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1918         then see if loop versioning checks can be used to make all data
1919         references satisfy vect_supportable_dr_alignment.  If so, update
1920         data structures as needed and return true.
1921
1922      C) If neither peeling nor versioning were successful then return false if
1923         any data reference does not satisfy vect_supportable_dr_alignment.
1924
1925      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1926
1927      Note, Possibility 3 above (which is peeling and versioning together) is not
1928      being done at this time.  */
1929
1930   /* (1) Peeling to force alignment.  */
1931
1932   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1933      Considerations:
1934      + How many accesses will become aligned due to the peeling
1935      - How many accesses will become unaligned due to the peeling,
1936        and the cost of misaligned accesses.
1937      - The cost of peeling (the extra runtime checks, the increase
1938        in code size).  */
1939
1940   FOR_EACH_VEC_ELT (datarefs, i, dr)
1941     {
1942       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1943       if (!vect_relevant_for_alignment_p (dr_info))
1944         continue;
1945
1946       stmt_vec_info stmt_info = dr_info->stmt;
1947       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1948       do_peeling = vector_alignment_reachable_p (dr_info);
1949       if (do_peeling)
1950         {
1951           if (known_alignment_for_access_p (dr_info, vectype))
1952             {
1953               unsigned int npeel_tmp = 0;
1954               bool negative = tree_int_cst_compare (DR_STEP (dr),
1955                                                     size_zero_node) < 0;
1956
1957               /* If known_alignment_for_access_p then we have set
1958                  DR_MISALIGNMENT which is only done if we know it at compile
1959                  time, so it is safe to assume target alignment is constant.
1960                */
1961               unsigned int target_align =
1962                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1963               unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1964               poly_int64 off = 0;
1965               if (negative)
1966                 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1967               unsigned int mis = dr_misalignment (dr_info, vectype, off);
1968               mis = negative ? mis : -mis;
1969               if (mis != 0)
1970                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1971
1972               /* For multiple types, it is possible that the bigger type access
1973                  will have more than one peeling option.  E.g., a loop with two
1974                  types: one of size (vector size / 4), and the other one of
1975                  size (vector size / 8).  The vectorization factor will be 8.  If both
1976                  accesses are misaligned by 3, the first one needs one scalar
1977                  iteration to be aligned, and the second one needs 5.  But the
1978                  first one will be aligned also by peeling 5 scalar
1979                  iterations, and in that case both accesses will be aligned.
1980                  Hence, except for the immediate peeling amount, we also want
1981                  to try to add full vector size, while we don't exceed
1982                  vectorization factor.
1983                  We do this automatically for cost model, since we calculate
1984                  cost for every peeling option.  */
1985               poly_uint64 nscalars = npeel_tmp;
1986               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1987                 {
1988                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989                   nscalars = (STMT_SLP_TYPE (stmt_info)
1990                               ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991                 }
1992
1993               /* Save info about DR in the hash table.  Also include peeling
1994                  amounts according to the explanation above.  Indicate
1995                  the alignment status when the ref is not aligned.
1996                  ???  Rather than using unknown alignment here we should
1997                  prune all entries from the peeling hashtable which cause
1998                  DRs to be not supported.  */
1999               bool supportable_if_not_aligned
2000                 = vect_supportable_dr_alignment
2001                     (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2002               while (known_le (npeel_tmp, nscalars))
2003                 {
2004                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2005                                             dr_info, npeel_tmp,
2006                                             supportable_if_not_aligned);
2007                   npeel_tmp += MAX (1, target_align / dr_size);
2008                 }
2009
2010               one_misalignment_known = true;
2011             }
2012           else
2013             {
2014               /* If we don't know any misalignment values, we prefer
2015                  peeling for data-ref that has the maximum number of data-refs
2016                  with the same alignment, unless the target prefers to align
2017                  stores over loads.  */
2018               unsigned same_align_drs = n_same_align_refs[i];
2019               if (!dr0_info
2020                   || dr0_same_align_drs < same_align_drs)
2021                 {
2022                   dr0_same_align_drs = same_align_drs;
2023                   dr0_info = dr_info;
2024                 }
2025               /* For data-refs with the same number of related
2026                  accesses prefer the one where the misalign
2027                  computation will be invariant in the outermost loop.  */
2028               else if (dr0_same_align_drs == same_align_drs)
2029                 {
2030                   class loop *ivloop0, *ivloop;
2031                   ivloop0 = outermost_invariant_loop_for_expr
2032                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
2033                   ivloop = outermost_invariant_loop_for_expr
2034                     (loop, DR_BASE_ADDRESS (dr));
2035                   if ((ivloop && !ivloop0)
2036                       || (ivloop && ivloop0
2037                           && flow_loop_nested_p (ivloop, ivloop0)))
2038                     dr0_info = dr_info;
2039                 }
2040
2041               one_misalignment_unknown = true;
2042
2043               /* Check for data refs with unsupportable alignment that
2044                  can be peeled.  */
2045               enum dr_alignment_support supportable_dr_alignment
2046                 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2047                                                  DR_MISALIGNMENT_UNKNOWN);
2048               if (supportable_dr_alignment == dr_unaligned_unsupported)
2049                 {
2050                   one_dr_unsupportable = true;
2051                   unsupportable_dr_info = dr_info;
2052                 }
2053
2054               if (!first_store && DR_IS_WRITE (dr))
2055                 {
2056                   first_store = dr_info;
2057                   first_store_same_align_drs = same_align_drs;
2058                 }
2059             }
2060         }
2061       else
2062         {
2063           if (!aligned_access_p (dr_info, vectype))
2064             {
2065               if (dump_enabled_p ())
2066                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067                                  "vector alignment may not be reachable\n");
2068               break;
2069             }
2070         }
2071     }
2072
2073   /* Check if we can possibly peel the loop.  */
2074   if (!vect_can_advance_ivs_p (loop_vinfo)
2075       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2076       || loop->inner)
2077     do_peeling = false;
2078
2079   struct _vect_peel_extended_info peel_for_known_alignment;
2080   struct _vect_peel_extended_info peel_for_unknown_alignment;
2081   struct _vect_peel_extended_info best_peel;
2082
2083   peel_for_unknown_alignment.inside_cost = INT_MAX;
2084   peel_for_unknown_alignment.outside_cost = INT_MAX;
2085   peel_for_unknown_alignment.peel_info.count = 0;
2086
2087   if (do_peeling
2088       && one_misalignment_unknown)
2089     {
2090       /* Check if the target requires to prefer stores over loads, i.e., if
2091          misaligned stores are more expensive than misaligned loads (taking
2092          drs with same alignment into account).  */
2093       unsigned int load_inside_cost = 0;
2094       unsigned int load_outside_cost = 0;
2095       unsigned int store_inside_cost = 0;
2096       unsigned int store_outside_cost = 0;
2097       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2098
2099       stmt_vector_for_cost dummy;
2100       dummy.create (2);
2101       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2102                                       &load_inside_cost,
2103                                       &load_outside_cost,
2104                                       &dummy, &dummy, estimated_npeels);
2105       dummy.release ();
2106
2107       if (first_store)
2108         {
2109           dummy.create (2);
2110           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2111                                           &store_inside_cost,
2112                                           &store_outside_cost,
2113                                           &dummy, &dummy,
2114                                           estimated_npeels);
2115           dummy.release ();
2116         }
2117       else
2118         {
2119           store_inside_cost = INT_MAX;
2120           store_outside_cost = INT_MAX;
2121         }
2122
2123       if (load_inside_cost > store_inside_cost
2124           || (load_inside_cost == store_inside_cost
2125               && load_outside_cost > store_outside_cost))
2126         {
2127           dr0_info = first_store;
2128           dr0_same_align_drs = first_store_same_align_drs;
2129           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2130           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2131         }
2132       else
2133         {
2134           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2135           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136         }
2137
2138       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2139       prologue_cost_vec.create (2);
2140       epilogue_cost_vec.create (2);
2141
2142       int dummy2;
2143       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2144         (loop_vinfo, estimated_npeels, &dummy2,
2145          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2146          &prologue_cost_vec, &epilogue_cost_vec);
2147
2148       prologue_cost_vec.release ();
2149       epilogue_cost_vec.release ();
2150
2151       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152     }
2153
2154   peel_for_unknown_alignment.peel_info.npeel = 0;
2155   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2156
2157   best_peel = peel_for_unknown_alignment;
2158
2159   peel_for_known_alignment.inside_cost = INT_MAX;
2160   peel_for_known_alignment.outside_cost = INT_MAX;
2161   peel_for_known_alignment.peel_info.count = 0;
2162   peel_for_known_alignment.peel_info.dr_info = NULL;
2163
2164   if (do_peeling && one_misalignment_known)
2165     {
2166       /* Peeling is possible, but there is no data access that is not supported
2167          unless aligned.  So we try to choose the best possible peeling from
2168          the hash table.  */
2169       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2170         (&peeling_htab, loop_vinfo);
2171     }
2172
2173   /* Compare costs of peeling for known and unknown alignment. */
2174   if (peel_for_known_alignment.peel_info.dr_info != NULL
2175       && peel_for_unknown_alignment.inside_cost
2176       >= peel_for_known_alignment.inside_cost)
2177     {
2178       best_peel = peel_for_known_alignment;
2179
2180       /* If the best peeling for known alignment has NPEEL == 0, perform no
2181          peeling at all except if there is an unsupportable dr that we can
2182          align.  */
2183       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2184         do_peeling = false;
2185     }
2186
2187   /* If there is an unsupportable data ref, prefer this over all choices so far
2188      since we'd have to discard a chosen peeling except when it accidentally
2189      aligned the unsupportable data ref.  */
2190   if (one_dr_unsupportable)
2191     dr0_info = unsupportable_dr_info;
2192   else if (do_peeling)
2193     {
2194       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2195          TODO: Use nopeel_outside_cost or get rid of it?  */
2196       unsigned nopeel_inside_cost = 0;
2197       unsigned nopeel_outside_cost = 0;
2198
2199       stmt_vector_for_cost dummy;
2200       dummy.create (2);
2201       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2202                                       &nopeel_outside_cost, &dummy, &dummy, 0);
2203       dummy.release ();
2204
2205       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2206          costs will be recorded.  */
2207       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2208       prologue_cost_vec.create (2);
2209       epilogue_cost_vec.create (2);
2210
2211       int dummy2;
2212       nopeel_outside_cost += vect_get_known_peeling_cost
2213         (loop_vinfo, 0, &dummy2,
2214          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2215          &prologue_cost_vec, &epilogue_cost_vec);
2216
2217       prologue_cost_vec.release ();
2218       epilogue_cost_vec.release ();
2219
2220       npeel = best_peel.peel_info.npeel;
2221       dr0_info = best_peel.peel_info.dr_info;
2222
2223       /* If not peeling at all is no more expensive than the best peeling we
2224          have so far, don't perform any peeling.  */
2225       if (nopeel_inside_cost <= best_peel.inside_cost)
2226         do_peeling = false;
2227     }
2228
2229   if (do_peeling)
2230     {
2231       stmt_vec_info stmt_info = dr0_info->stmt;
2232       if (known_alignment_for_access_p (dr0_info,
2233                                         STMT_VINFO_VECTYPE (stmt_info)))
2234         {
2235           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2236                                                 size_zero_node) < 0;
2237           if (!npeel)
2238             {
2239               /* Since it's known at compile time, compute the number of
2240                  iterations in the peeled loop (the peeling factor) for use in
2241                  updating DR_MISALIGNMENT values.  The peeling factor is the
2242                  vectorization factor minus the misalignment as an element
2243                  count.  */
2244               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2245               poly_int64 off = 0;
2246               if (negative)
2247                 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2248                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2249               unsigned int mis
2250                 = dr_misalignment (dr0_info, vectype, off);
2251               mis = negative ? mis : -mis;
2252               /* If known_alignment_for_access_p then we have set
2253                  DR_MISALIGNMENT which is only done if we know it at compiler
2254                  time, so it is safe to assume target alignment is constant.
2255                */
2256               unsigned int target_align =
2257                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2258               npeel = ((mis & (target_align - 1))
2259                        / vect_get_scalar_dr_size (dr0_info));
2260             }
2261
2262           /* For interleaved data access every iteration accesses all the
2263              members of the group, therefore we divide the number of iterations
2264              by the group size.  */
2265           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2266             npeel /= DR_GROUP_SIZE (stmt_info);
2267
2268           if (dump_enabled_p ())
2269             dump_printf_loc (MSG_NOTE, vect_location,
2270                              "Try peeling by %d\n", npeel);
2271         }
2272
2273       /* Ensure that all datarefs can be vectorized after the peel.  */
2274       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2275         do_peeling = false;
2276
2277       /* Check if all datarefs are supportable and log.  */
2278       if (do_peeling
2279           && npeel == 0
2280           && known_alignment_for_access_p (dr0_info,
2281                                            STMT_VINFO_VECTYPE (stmt_info)))
2282         return opt_result::success ();
2283
2284       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2285       if (do_peeling)
2286         {
2287           unsigned max_allowed_peel
2288             = param_vect_max_peeling_for_alignment;
2289           if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2290             max_allowed_peel = 0;
2291           if (max_allowed_peel != (unsigned)-1)
2292             {
2293               unsigned max_peel = npeel;
2294               if (max_peel == 0)
2295                 {
2296                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2297                   unsigned HOST_WIDE_INT target_align_c;
2298                   if (target_align.is_constant (&target_align_c))
2299                     max_peel =
2300                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2301                   else
2302                     {
2303                       do_peeling = false;
2304                       if (dump_enabled_p ())
2305                         dump_printf_loc (MSG_NOTE, vect_location,
2306                           "Disable peeling, max peels set and vector"
2307                           " alignment unknown\n");
2308                     }
2309                 }
2310               if (max_peel > max_allowed_peel)
2311                 {
2312                   do_peeling = false;
2313                   if (dump_enabled_p ())
2314                     dump_printf_loc (MSG_NOTE, vect_location,
2315                         "Disable peeling, max peels reached: %d\n", max_peel);
2316                 }
2317             }
2318         }
2319
2320       /* Cost model #2 - if peeling may result in a remaining loop not
2321          iterating enough to be vectorized then do not peel.  Since this
2322          is a cost heuristic rather than a correctness decision, use the
2323          most likely runtime value for variable vectorization factors.  */
2324       if (do_peeling
2325           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2326         {
2327           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2328           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2329           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2330               < assumed_vf + max_peel)
2331             do_peeling = false;
2332         }
2333
2334       if (do_peeling)
2335         {
2336           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2337              If the misalignment of DR_i is identical to that of dr0 then set
2338              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2339              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2340              by the peeling factor times the element size of DR_i (MOD the
2341              vectorization factor times the size).  Otherwise, the
2342              misalignment of DR_i must be set to unknown.  */
2343           FOR_EACH_VEC_ELT (datarefs, i, dr)
2344             if (dr != dr0_info->dr)
2345               {
2346                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2347                 if (!vect_relevant_for_alignment_p (dr_info))
2348                   continue;
2349
2350                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351               }
2352
2353           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2354           if (npeel)
2355             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2356           else
2357             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2358           SET_DR_MISALIGNMENT (dr0_info,
2359                                vect_dr_misalign_for_aligned_access (dr0_info));
2360           if (dump_enabled_p ())
2361             {
2362               dump_printf_loc (MSG_NOTE, vect_location,
2363                                "Alignment of access forced using peeling.\n");
2364               dump_printf_loc (MSG_NOTE, vect_location,
2365                                "Peeling for alignment will be applied.\n");
2366             }
2367
2368           /* The inside-loop cost will be accounted for in vectorizable_load
2369              and vectorizable_store correctly with adjusted alignments.
2370              Drop the body_cst_vec on the floor here.  */
2371           return opt_result::success ();
2372         }
2373     }
2374
2375   /* (2) Versioning to force alignment.  */
2376
2377   /* Try versioning if:
2378      1) optimize loop for speed and the cost-model is not cheap
2379      2) there is at least one unsupported misaligned data ref with an unknown
2380         misalignment, and
2381      3) all misaligned data refs with a known misalignment are supported, and
2382      4) the number of runtime alignment checks is within reason.  */
2383
2384   do_versioning
2385     = (optimize_loop_nest_for_speed_p (loop)
2386        && !loop->inner /* FORNOW */
2387        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2388
2389   if (do_versioning)
2390     {
2391       FOR_EACH_VEC_ELT (datarefs, i, dr)
2392         {
2393           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2394           if (!vect_relevant_for_alignment_p (dr_info))
2395             continue;
2396
2397           stmt_vec_info stmt_info = dr_info->stmt;
2398           if (STMT_VINFO_STRIDED_P (stmt_info))
2399             {
2400               do_versioning = false;
2401               break;
2402             }
2403
2404           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2405           bool negative = tree_int_cst_compare (DR_STEP (dr),
2406                                                 size_zero_node) < 0;
2407           poly_int64 off = 0;
2408           if (negative)
2409             off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2410                    * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2411           int misalignment;
2412           if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2413             continue;
2414
2415           enum dr_alignment_support supportable_dr_alignment
2416             = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2417                                              misalignment);
2418           if (supportable_dr_alignment == dr_unaligned_unsupported)
2419             {
2420               if (misalignment != DR_MISALIGNMENT_UNKNOWN
2421                   || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2422                       >= (unsigned) param_vect_max_version_for_alignment_checks))
2423                 {
2424                   do_versioning = false;
2425                   break;
2426                 }
2427
2428               /* At present we don't support versioning for alignment
2429                  with variable VF, since there's no guarantee that the
2430                  VF is a power of two.  We could relax this if we added
2431                  a way of enforcing a power-of-two size.  */
2432               unsigned HOST_WIDE_INT size;
2433               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2434                 {
2435                   do_versioning = false;
2436                   break;
2437                 }
2438
2439               /* Forcing alignment in the first iteration is no good if
2440                  we don't keep it across iterations.  For now, just disable
2441                  versioning in this case.
2442                  ?? We could actually unroll the loop to achieve the required
2443                  overall step alignment, and forcing the alignment could be
2444                  done by doing some iterations of the non-vectorized loop.  */
2445               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2446                                * DR_STEP_ALIGNMENT (dr),
2447                                DR_TARGET_ALIGNMENT (dr_info)))
2448                 {
2449                   do_versioning = false;
2450                   break;
2451                 }
2452
2453               /* The rightmost bits of an aligned address must be zeros.
2454                  Construct the mask needed for this test.  For example,
2455                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2456                  mask must be 15 = 0xf. */
2457               int mask = size - 1;
2458
2459               /* FORNOW: use the same mask to test all potentially unaligned
2460                  references in the loop.  */
2461               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2462                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2463                 {
2464                   do_versioning = false;
2465                   break;
2466                 }
2467
2468               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2469               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2470             }
2471         }
2472
2473       /* Versioning requires at least one misaligned data reference.  */
2474       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2475         do_versioning = false;
2476       else if (!do_versioning)
2477         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478     }
2479
2480   if (do_versioning)
2481     {
2482       const vec<stmt_vec_info> &may_misalign_stmts
2483         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2484       stmt_vec_info stmt_info;
2485
2486       /* It can now be assumed that the data references in the statements
2487          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2488          of the loop being vectorized.  */
2489       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2490         {
2491           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2492           SET_DR_MISALIGNMENT (dr_info,
2493                                vect_dr_misalign_for_aligned_access (dr_info));
2494           if (dump_enabled_p ())
2495             dump_printf_loc (MSG_NOTE, vect_location,
2496                              "Alignment of access forced using versioning.\n");
2497         }
2498
2499       if (dump_enabled_p ())
2500         dump_printf_loc (MSG_NOTE, vect_location,
2501                          "Versioning for alignment will be applied.\n");
2502
2503       /* Peeling and versioning can't be done together at this time.  */
2504       gcc_assert (! (do_peeling && do_versioning));
2505
2506       return opt_result::success ();
2507     }
2508
2509   /* This point is reached if neither peeling nor versioning is being done.  */
2510   gcc_assert (! (do_peeling || do_versioning));
2511
2512   return opt_result::success ();
2513 }
2514
2515
2516 /* Function vect_analyze_data_refs_alignment
2517
2518    Analyze the alignment of the data-references in the loop.
2519    Return FALSE if a data reference is found that cannot be vectorized.  */
2520
2521 opt_result
2522 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2523 {
2524   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2525
2526   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2527   struct data_reference *dr;
2528   unsigned int i;
2529
2530   vect_record_base_alignments (loop_vinfo);
2531   FOR_EACH_VEC_ELT (datarefs, i, dr)
2532     {
2533       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2534       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2535         {
2536           if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2537               && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2538             continue;
2539           vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2540                                            STMT_VINFO_VECTYPE (dr_info->stmt));
2541         }
2542     }
2543
2544   return opt_result::success ();
2545 }
2546
2547
2548 /* Analyze alignment of DRs of stmts in NODE.  */
2549
2550 static bool
2551 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2552 {
2553   /* Alignment is maintained in the first element of the group.  */
2554   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2555   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2556   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2557   tree vectype = SLP_TREE_VECTYPE (node);
2558   poly_uint64 vector_alignment
2559     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2560                  BITS_PER_UNIT);
2561   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2562     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2563   /* Re-analyze alignment when we're facing a vectorization with a bigger
2564      alignment requirement.  */
2565   else if (known_lt (dr_info->target_alignment, vector_alignment))
2566     {
2567       poly_uint64 old_target_alignment = dr_info->target_alignment;
2568       int old_misalignment = dr_info->misalignment;
2569       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2570       /* But keep knowledge about a smaller alignment.  */
2571       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2572           && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2573         {
2574           dr_info->target_alignment = old_target_alignment;
2575           dr_info->misalignment = old_misalignment;
2576         }
2577     }
2578   /* When we ever face unordered target alignments the first one wins in terms
2579      of analyzing and the other will become unknown in dr_misalignment.  */
2580   return true;
2581 }
2582
2583 /* Function vect_slp_analyze_instance_alignment
2584
2585    Analyze the alignment of the data-references in the SLP instance.
2586    Return FALSE if a data reference is found that cannot be vectorized.  */
2587
2588 bool
2589 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2590                                                 slp_instance instance)
2591 {
2592   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2593
2594   slp_tree node;
2595   unsigned i;
2596   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2597     if (! vect_slp_analyze_node_alignment (vinfo, node))
2598       return false;
2599
2600   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2601       && ! vect_slp_analyze_node_alignment
2602              (vinfo, SLP_INSTANCE_TREE (instance)))
2603     return false;
2604
2605   return true;
2606 }
2607
2608
2609 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2610    accesses of legal size, step, etc.  Detect gaps, single element
2611    interleaving, and other special cases. Set grouped access info.
2612    Collect groups of strided stores for further use in SLP analysis.
2613    Worker for vect_analyze_group_access.  */
2614
2615 static bool
2616 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2617 {
2618   data_reference *dr = dr_info->dr;
2619   tree step = DR_STEP (dr);
2620   tree scalar_type = TREE_TYPE (DR_REF (dr));
2621   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2622   stmt_vec_info stmt_info = dr_info->stmt;
2623   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2624   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2625   HOST_WIDE_INT dr_step = -1;
2626   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2627   bool slp_impossible = false;
2628
2629   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2630      size of the interleaving group (including gaps).  */
2631   if (tree_fits_shwi_p (step))
2632     {
2633       dr_step = tree_to_shwi (step);
2634       /* Check that STEP is a multiple of type size.  Otherwise there is
2635          a non-element-sized gap at the end of the group which we
2636          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2637          ???  As we can handle non-constant step fine here we should
2638          simply remove uses of DR_GROUP_GAP between the last and first
2639          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2640          simply not include that gap.  */
2641       if ((dr_step % type_size) != 0)
2642         {
2643           if (dump_enabled_p ())
2644             dump_printf_loc (MSG_NOTE, vect_location,
2645                              "Step %T is not a multiple of the element size"
2646                              " for %T\n",
2647                              step, DR_REF (dr));
2648           return false;
2649         }
2650       groupsize = absu_hwi (dr_step) / type_size;
2651     }
2652   else
2653     groupsize = 0;
2654
2655   /* Not consecutive access is possible only if it is a part of interleaving.  */
2656   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2657     {
2658       /* Check if it this DR is a part of interleaving, and is a single
2659          element of the group that is accessed in the loop.  */
2660
2661       /* Gaps are supported only for loads. STEP must be a multiple of the type
2662          size.  */
2663       if (DR_IS_READ (dr)
2664           && (dr_step % type_size) == 0
2665           && groupsize > 0
2666           /* This could be UINT_MAX but as we are generating code in a very
2667              inefficient way we have to cap earlier.
2668              See PR91403 for example.  */
2669           && groupsize <= 4096)
2670         {
2671           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2672           DR_GROUP_SIZE (stmt_info) = groupsize;
2673           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2674           if (dump_enabled_p ())
2675             dump_printf_loc (MSG_NOTE, vect_location,
2676                              "Detected single element interleaving %T"
2677                              " step %T\n",
2678                              DR_REF (dr), step);
2679
2680           return true;
2681         }
2682
2683       if (dump_enabled_p ())
2684         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2685                          "not consecutive access %G", stmt_info->stmt);
2686
2687       if (bb_vinfo)
2688         {
2689           /* Mark the statement as unvectorizable.  */
2690           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2691           return true;
2692         }
2693
2694       if (dump_enabled_p ())
2695         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2696       STMT_VINFO_STRIDED_P (stmt_info) = true;
2697       return true;
2698     }
2699
2700   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2701     {
2702       /* First stmt in the interleaving chain. Check the chain.  */
2703       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2704       struct data_reference *data_ref = dr;
2705       unsigned int count = 1;
2706       tree prev_init = DR_INIT (data_ref);
2707       HOST_WIDE_INT diff, gaps = 0;
2708
2709       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2710       while (next)
2711         {
2712           /* We never have the same DR multiple times.  */
2713           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2714                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2715
2716           data_ref = STMT_VINFO_DATA_REF (next);
2717
2718           /* All group members have the same STEP by construction.  */
2719           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2720
2721           /* Check that the distance between two accesses is equal to the type
2722              size. Otherwise, we have gaps.  */
2723           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2724                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2725           if (diff < 1 || diff > UINT_MAX)
2726             {
2727               /* For artificial testcases with array accesses with large
2728                  constant indices we can run into overflow issues which
2729                  can end up fooling the groupsize constraint below so
2730                  check the individual gaps (which are represented as
2731                  unsigned int) as well.  */
2732               if (dump_enabled_p ())
2733                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2734                                  "interleaved access with gap larger "
2735                                  "than representable\n");
2736               return false;
2737             }
2738           if (diff != 1)
2739             {
2740               /* FORNOW: SLP of accesses with gaps is not supported.  */
2741               slp_impossible = true;
2742               if (DR_IS_WRITE (data_ref))
2743                 {
2744                   if (dump_enabled_p ())
2745                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2746                                      "interleaved store with gaps\n");
2747                   return false;
2748                 }
2749
2750               gaps += diff - 1;
2751             }
2752
2753           last_accessed_element += diff;
2754
2755           /* Store the gap from the previous member of the group. If there is no
2756              gap in the access, DR_GROUP_GAP is always 1.  */
2757           DR_GROUP_GAP (next) = diff;
2758
2759           prev_init = DR_INIT (data_ref);
2760           next = DR_GROUP_NEXT_ELEMENT (next);
2761           /* Count the number of data-refs in the chain.  */
2762           count++;
2763         }
2764
2765       if (groupsize == 0)
2766         groupsize = count + gaps;
2767
2768       /* This could be UINT_MAX but as we are generating code in a very
2769          inefficient way we have to cap earlier.  See PR78699 for example.  */
2770       if (groupsize > 4096)
2771         {
2772           if (dump_enabled_p ())
2773             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2774                              "group is too large\n");
2775           return false;
2776         }
2777
2778       /* Check that the size of the interleaving is equal to count for stores,
2779          i.e., that there are no gaps.  */
2780       if (groupsize != count
2781           && !DR_IS_READ (dr))
2782         {
2783           groupsize = count;
2784           STMT_VINFO_STRIDED_P (stmt_info) = true;
2785         }
2786
2787       /* If there is a gap after the last load in the group it is the
2788          difference between the groupsize and the last accessed
2789          element.
2790          When there is no gap, this difference should be 0.  */
2791       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2792
2793       DR_GROUP_SIZE (stmt_info) = groupsize;
2794       if (dump_enabled_p ())
2795         {
2796           dump_printf_loc (MSG_NOTE, vect_location,
2797                            "Detected interleaving ");
2798           if (DR_IS_READ (dr))
2799             dump_printf (MSG_NOTE, "load ");
2800           else if (STMT_VINFO_STRIDED_P (stmt_info))
2801             dump_printf (MSG_NOTE, "strided store ");
2802           else
2803             dump_printf (MSG_NOTE, "store ");
2804           dump_printf (MSG_NOTE, "of size %u\n",
2805                        (unsigned)groupsize);
2806           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2807           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2808           while (next)
2809             {
2810               if (DR_GROUP_GAP (next) != 1)
2811                 dump_printf_loc (MSG_NOTE, vect_location,
2812                                  "\t<gap of %d elements>\n",
2813                                  DR_GROUP_GAP (next) - 1);
2814               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2815               next = DR_GROUP_NEXT_ELEMENT (next);
2816             }
2817           if (DR_GROUP_GAP (stmt_info) != 0)
2818             dump_printf_loc (MSG_NOTE, vect_location,
2819                              "\t<gap of %d elements>\n",
2820                              DR_GROUP_GAP (stmt_info));
2821         }
2822
2823       /* SLP: create an SLP data structure for every interleaving group of
2824          stores for further analysis in vect_analyse_slp.  */
2825       if (DR_IS_WRITE (dr) && !slp_impossible)
2826         {
2827           if (loop_vinfo)
2828             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2829           if (bb_vinfo)
2830             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2831         }
2832     }
2833
2834   return true;
2835 }
2836
2837 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2838    accesses of legal size, step, etc.  Detect gaps, single element
2839    interleaving, and other special cases. Set grouped access info.
2840    Collect groups of strided stores for further use in SLP analysis.  */
2841
2842 static bool
2843 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2844 {
2845   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2846     {
2847       /* Dissolve the group if present.  */
2848       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2849       while (stmt_info)
2850         {
2851           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2852           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2853           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2854           stmt_info = next;
2855         }
2856       return false;
2857     }
2858   return true;
2859 }
2860
2861 /* Analyze the access pattern of the data-reference DR_INFO.
2862    In case of non-consecutive accesses call vect_analyze_group_access() to
2863    analyze groups of accesses.  */
2864
2865 static bool
2866 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2867 {
2868   data_reference *dr = dr_info->dr;
2869   tree step = DR_STEP (dr);
2870   tree scalar_type = TREE_TYPE (DR_REF (dr));
2871   stmt_vec_info stmt_info = dr_info->stmt;
2872   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2873   class loop *loop = NULL;
2874
2875   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2876     return true;
2877
2878   if (loop_vinfo)
2879     loop = LOOP_VINFO_LOOP (loop_vinfo);
2880
2881   if (loop_vinfo && !step)
2882     {
2883       if (dump_enabled_p ())
2884         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885                          "bad data-ref access in loop\n");
2886       return false;
2887     }
2888
2889   /* Allow loads with zero step in inner-loop vectorization.  */
2890   if (loop_vinfo && integer_zerop (step))
2891     {
2892       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2893       if (!nested_in_vect_loop_p (loop, stmt_info))
2894         return DR_IS_READ (dr);
2895       /* Allow references with zero step for outer loops marked
2896          with pragma omp simd only - it guarantees absence of
2897          loop-carried dependencies between inner loop iterations.  */
2898       if (loop->safelen < 2)
2899         {
2900           if (dump_enabled_p ())
2901             dump_printf_loc (MSG_NOTE, vect_location,
2902                              "zero step in inner loop of nest\n");
2903           return false;
2904         }
2905     }
2906
2907   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2908     {
2909       /* Interleaved accesses are not yet supported within outer-loop
2910         vectorization for references in the inner-loop.  */
2911       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2912
2913       /* For the rest of the analysis we use the outer-loop step.  */
2914       step = STMT_VINFO_DR_STEP (stmt_info);
2915       if (integer_zerop (step))
2916         {
2917           if (dump_enabled_p ())
2918             dump_printf_loc (MSG_NOTE, vect_location,
2919                              "zero step in outer loop.\n");
2920           return DR_IS_READ (dr);
2921         }
2922     }
2923
2924   /* Consecutive?  */
2925   if (TREE_CODE (step) == INTEGER_CST)
2926     {
2927       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2928       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2929           || (dr_step < 0
2930               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2931         {
2932           /* Mark that it is not interleaving.  */
2933           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2934           return true;
2935         }
2936     }
2937
2938   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2939     {
2940       if (dump_enabled_p ())
2941         dump_printf_loc (MSG_NOTE, vect_location,
2942                          "grouped access in outer loop.\n");
2943       return false;
2944     }
2945
2946
2947   /* Assume this is a DR handled by non-constant strided load case.  */
2948   if (TREE_CODE (step) != INTEGER_CST)
2949     return (STMT_VINFO_STRIDED_P (stmt_info)
2950             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2951                 || vect_analyze_group_access (vinfo, dr_info)));
2952
2953   /* Not consecutive access - check if it's a part of interleaving group.  */
2954   return vect_analyze_group_access (vinfo, dr_info);
2955 }
2956
2957 /* Compare two data-references DRA and DRB to group them into chunks
2958    suitable for grouping.  */
2959
2960 static int
2961 dr_group_sort_cmp (const void *dra_, const void *drb_)
2962 {
2963   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2964   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2965   data_reference_p dra = dra_info->dr;
2966   data_reference_p drb = drb_info->dr;
2967   int cmp;
2968
2969   /* Stabilize sort.  */
2970   if (dra == drb)
2971     return 0;
2972
2973   /* Different group IDs lead never belong to the same group.  */
2974   if (dra_info->group != drb_info->group)
2975     return dra_info->group < drb_info->group ? -1 : 1;
2976
2977   /* Ordering of DRs according to base.  */
2978   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2979                                DR_BASE_ADDRESS (drb));
2980   if (cmp != 0)
2981     return cmp;
2982
2983   /* And according to DR_OFFSET.  */
2984   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2985   if (cmp != 0)
2986     return cmp;
2987
2988   /* Put reads before writes.  */
2989   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2990     return DR_IS_READ (dra) ? -1 : 1;
2991
2992   /* Then sort after access size.  */
2993   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2994                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2995   if (cmp != 0)
2996     return cmp;
2997
2998   /* And after step.  */
2999   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3000   if (cmp != 0)
3001     return cmp;
3002
3003   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3004   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3005   if (cmp == 0)
3006     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3007   return cmp;
3008 }
3009
3010 /* If OP is the result of a conversion, return the unconverted value,
3011    otherwise return null.  */
3012
3013 static tree
3014 strip_conversion (tree op)
3015 {
3016   if (TREE_CODE (op) != SSA_NAME)
3017     return NULL_TREE;
3018   gimple *stmt = SSA_NAME_DEF_STMT (op);
3019   if (!is_gimple_assign (stmt)
3020       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3021     return NULL_TREE;
3022   return gimple_assign_rhs1 (stmt);
3023 }
3024
3025 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3026    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3027    be grouped in SLP mode.  */
3028
3029 static bool
3030 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3031                    bool allow_slp_p)
3032 {
3033   if (gimple_assign_single_p (stmt1_info->stmt))
3034     return gimple_assign_single_p (stmt2_info->stmt);
3035
3036   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3037   if (call1 && gimple_call_internal_p (call1))
3038     {
3039       /* Check for two masked loads or two masked stores.  */
3040       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3041       if (!call2 || !gimple_call_internal_p (call2))
3042         return false;
3043       internal_fn ifn = gimple_call_internal_fn (call1);
3044       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3045         return false;
3046       if (ifn != gimple_call_internal_fn (call2))
3047         return false;
3048
3049       /* Check that the masks are the same.  Cope with casts of masks,
3050          like those created by build_mask_conversion.  */
3051       tree mask1 = gimple_call_arg (call1, 2);
3052       tree mask2 = gimple_call_arg (call2, 2);
3053       if (!operand_equal_p (mask1, mask2, 0)
3054           && (ifn == IFN_MASK_STORE || !allow_slp_p))
3055         {
3056           mask1 = strip_conversion (mask1);
3057           if (!mask1)
3058             return false;
3059           mask2 = strip_conversion (mask2);
3060           if (!mask2)
3061             return false;
3062           if (!operand_equal_p (mask1, mask2, 0))
3063             return false;
3064         }
3065       return true;
3066     }
3067
3068   return false;
3069 }
3070
3071 /* Function vect_analyze_data_ref_accesses.
3072
3073    Analyze the access pattern of all the data references in the loop.
3074
3075    FORNOW: the only access pattern that is considered vectorizable is a
3076            simple step 1 (consecutive) access.
3077
3078    FORNOW: handle only arrays and pointer accesses.  */
3079
3080 opt_result
3081 vect_analyze_data_ref_accesses (vec_info *vinfo,
3082                                 vec<int> *dataref_groups)
3083 {
3084   unsigned int i;
3085   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3086
3087   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3088
3089   if (datarefs.is_empty ())
3090     return opt_result::success ();
3091
3092   /* Sort the array of datarefs to make building the interleaving chains
3093      linear.  Don't modify the original vector's order, it is needed for
3094      determining what dependencies are reversed.  */
3095   vec<dr_vec_info *> datarefs_copy;
3096   datarefs_copy.create (datarefs.length ());
3097   for (unsigned i = 0; i < datarefs.length (); i++)
3098     {
3099       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3100       /* If the caller computed DR grouping use that, otherwise group by
3101          basic blocks.  */
3102       if (dataref_groups)
3103         dr_info->group = (*dataref_groups)[i];
3104       else
3105         dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3106       datarefs_copy.quick_push (dr_info);
3107     }
3108   datarefs_copy.qsort (dr_group_sort_cmp);
3109   hash_set<stmt_vec_info> to_fixup;
3110
3111   /* Build the interleaving chains.  */
3112   for (i = 0; i < datarefs_copy.length () - 1;)
3113     {
3114       dr_vec_info *dr_info_a = datarefs_copy[i];
3115       data_reference_p dra = dr_info_a->dr;
3116       int dra_group_id = dr_info_a->group;
3117       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3118       stmt_vec_info lastinfo = NULL;
3119       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3120           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3121         {
3122           ++i;
3123           continue;
3124         }
3125       for (i = i + 1; i < datarefs_copy.length (); ++i)
3126         {
3127           dr_vec_info *dr_info_b = datarefs_copy[i];
3128           data_reference_p drb = dr_info_b->dr;
3129           int drb_group_id = dr_info_b->group;
3130           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3131           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3132               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3133             break;
3134
3135           /* ???  Imperfect sorting (non-compatible types, non-modulo
3136              accesses, same accesses) can lead to a group to be artificially
3137              split here as we don't just skip over those.  If it really
3138              matters we can push those to a worklist and re-iterate
3139              over them.  The we can just skip ahead to the next DR here.  */
3140
3141           /* DRs in a different DR group should not be put into the same
3142              interleaving group.  */
3143           if (dra_group_id != drb_group_id)
3144             break;
3145
3146           /* Check that the data-refs have same first location (except init)
3147              and they are both either store or load (not load and store,
3148              not masked loads or stores).  */
3149           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3150               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3151                                         DR_BASE_ADDRESS (drb)) != 0
3152               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3153               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3154             break;
3155
3156           /* Check that the data-refs have the same constant size.  */
3157           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3158           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3159           if (!tree_fits_uhwi_p (sza)
3160               || !tree_fits_uhwi_p (szb)
3161               || !tree_int_cst_equal (sza, szb))
3162             break;
3163
3164           /* Check that the data-refs have the same step.  */
3165           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3166             break;
3167
3168           /* Check the types are compatible.
3169              ???  We don't distinguish this during sorting.  */
3170           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3171                                    TREE_TYPE (DR_REF (drb))))
3172             break;
3173
3174           /* Check that the DR_INITs are compile-time constants.  */
3175           if (!tree_fits_shwi_p (DR_INIT (dra))
3176               || !tree_fits_shwi_p (DR_INIT (drb)))
3177             break;
3178
3179           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3180              just hold extra information.  */
3181           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3182               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3183               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3184             break;
3185
3186           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3187           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3188           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3189           HOST_WIDE_INT init_prev
3190             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3191           gcc_assert (init_a <= init_b
3192                       && init_a <= init_prev
3193                       && init_prev <= init_b);
3194
3195           /* Do not place the same access in the interleaving chain twice.  */
3196           if (init_b == init_prev)
3197             {
3198               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3199                           < gimple_uid (DR_STMT (drb)));
3200               /* Simply link in duplicates and fix up the chain below.  */
3201             }
3202           else
3203             {
3204               /* If init_b == init_a + the size of the type * k, we have an
3205                  interleaving, and DRA is accessed before DRB.  */
3206               unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3207               if (type_size_a == 0
3208                   || (((unsigned HOST_WIDE_INT)init_b - init_a)
3209                       % type_size_a != 0))
3210                 break;
3211
3212               /* If we have a store, the accesses are adjacent.  This splits
3213                  groups into chunks we support (we don't support vectorization
3214                  of stores with gaps).  */
3215               if (!DR_IS_READ (dra)
3216                   && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3217                       != type_size_a))
3218                 break;
3219
3220               /* If the step (if not zero or non-constant) is smaller than the
3221                  difference between data-refs' inits this splits groups into
3222                  suitable sizes.  */
3223               if (tree_fits_shwi_p (DR_STEP (dra)))
3224                 {
3225                   unsigned HOST_WIDE_INT step
3226                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3227                   if (step != 0
3228                       && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3229                     break;
3230                 }
3231             }
3232
3233           if (dump_enabled_p ())
3234             dump_printf_loc (MSG_NOTE, vect_location,
3235                              DR_IS_READ (dra)
3236                              ? "Detected interleaving load %T and %T\n"
3237                              : "Detected interleaving store %T and %T\n",
3238                              DR_REF (dra), DR_REF (drb));
3239
3240           /* Link the found element into the group list.  */
3241           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3242             {
3243               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3244               lastinfo = stmtinfo_a;
3245             }
3246           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3247           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3248           lastinfo = stmtinfo_b;
3249
3250           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3251             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3252
3253           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3254             dump_printf_loc (MSG_NOTE, vect_location,
3255                              "Load suitable for SLP vectorization only.\n");
3256
3257           if (init_b == init_prev
3258               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3259               && dump_enabled_p ())
3260             dump_printf_loc (MSG_NOTE, vect_location,
3261                              "Queuing group with duplicate access for fixup\n");
3262         }
3263     }
3264
3265   /* Fixup groups with duplicate entries by splitting it.  */
3266   while (1)
3267     {
3268       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3269       if (!(it != to_fixup.end ()))
3270         break;
3271       stmt_vec_info grp = *it;
3272       to_fixup.remove (grp);
3273
3274       /* Find the earliest duplicate group member.  */
3275       unsigned first_duplicate = -1u;
3276       stmt_vec_info next, g = grp;
3277       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3278         {
3279           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3280                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3281               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3282             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3283           g = next;
3284         }
3285       if (first_duplicate == -1U)
3286         continue;
3287
3288       /* Then move all stmts after the first duplicate to a new group.
3289          Note this is a heuristic but one with the property that *it
3290          is fixed up completely.  */
3291       g = grp;
3292       stmt_vec_info newgroup = NULL, ng = grp;
3293       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3294         {
3295           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3296             {
3297               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3298               if (!newgroup)
3299                 newgroup = next;
3300               else
3301                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3302               ng = next;
3303               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3304             }
3305           else
3306             g = DR_GROUP_NEXT_ELEMENT (g);
3307         }
3308       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3309
3310       /* Fixup the new group which still may contain duplicates.  */
3311       to_fixup.add (newgroup);
3312     }
3313
3314   dr_vec_info *dr_info;
3315   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3316     {
3317       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3318           && !vect_analyze_data_ref_access (vinfo, dr_info))
3319         {
3320           if (dump_enabled_p ())
3321             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322                              "not vectorized: complicated access pattern.\n");
3323
3324           if (is_a <bb_vec_info> (vinfo))
3325             {
3326               /* Mark the statement as not vectorizable.  */
3327               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3328               continue;
3329             }
3330           else
3331             {
3332               datarefs_copy.release ();
3333               return opt_result::failure_at (dr_info->stmt->stmt,
3334                                              "not vectorized:"
3335                                              " complicated access pattern.\n");
3336             }
3337         }
3338     }
3339
3340   datarefs_copy.release ();
3341   return opt_result::success ();
3342 }
3343
3344 /* Function vect_vfa_segment_size.
3345
3346    Input:
3347      DR_INFO: The data reference.
3348      LENGTH_FACTOR: segment length to consider.
3349
3350    Return a value suitable for the dr_with_seg_len::seg_len field.
3351    This is the "distance travelled" by the pointer from the first
3352    iteration in the segment to the last.  Note that it does not include
3353    the size of the access; in effect it only describes the first byte.  */
3354
3355 static tree
3356 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3357 {
3358   length_factor = size_binop (MINUS_EXPR,
3359                               fold_convert (sizetype, length_factor),
3360                               size_one_node);
3361   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3362                      length_factor);
3363 }
3364
3365 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3366    gives the worst-case number of bytes covered by the segment.  */
3367
3368 static unsigned HOST_WIDE_INT
3369 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3370 {
3371   stmt_vec_info stmt_vinfo = dr_info->stmt;
3372   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3373   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3374   unsigned HOST_WIDE_INT access_size = ref_size;
3375   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3376     {
3377       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3378       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3379     }
3380   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3381   int misalignment;
3382   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3383       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3384       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3385           == dr_explicit_realign_optimized))
3386     {
3387       /* We might access a full vector's worth.  */
3388       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3389     }
3390   return access_size;
3391 }
3392
3393 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3394    describes.  */
3395
3396 static unsigned int
3397 vect_vfa_align (dr_vec_info *dr_info)
3398 {
3399   return dr_alignment (dr_info->dr);
3400 }
3401
3402 /* Function vect_no_alias_p.
3403
3404    Given data references A and B with equal base and offset, see whether
3405    the alias relation can be decided at compilation time.  Return 1 if
3406    it can and the references alias, 0 if it can and the references do
3407    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3408    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3409    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3410
3411 static int
3412 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3413                          tree segment_length_a, tree segment_length_b,
3414                          unsigned HOST_WIDE_INT access_size_a,
3415                          unsigned HOST_WIDE_INT access_size_b)
3416 {
3417   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3418   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3419   poly_uint64 const_length_a;
3420   poly_uint64 const_length_b;
3421
3422   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3423      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3424      [a, a+12) */
3425   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3426     {
3427       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3428       offset_a -= const_length_a;
3429     }
3430   else
3431     const_length_a = tree_to_poly_uint64 (segment_length_a);
3432   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3433     {
3434       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3435       offset_b -= const_length_b;
3436     }
3437   else
3438     const_length_b = tree_to_poly_uint64 (segment_length_b);
3439
3440   const_length_a += access_size_a;
3441   const_length_b += access_size_b;
3442
3443   if (ranges_known_overlap_p (offset_a, const_length_a,
3444                               offset_b, const_length_b))
3445     return 1;
3446
3447   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3448                                offset_b, const_length_b))
3449     return 0;
3450
3451   return -1;
3452 }
3453
3454 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3455    in DDR is >= VF.  */
3456
3457 static bool
3458 dependence_distance_ge_vf (data_dependence_relation *ddr,
3459                            unsigned int loop_depth, poly_uint64 vf)
3460 {
3461   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3462       || DDR_NUM_DIST_VECTS (ddr) == 0)
3463     return false;
3464
3465   /* If the dependence is exact, we should have limited the VF instead.  */
3466   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3467
3468   unsigned int i;
3469   lambda_vector dist_v;
3470   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3471     {
3472       HOST_WIDE_INT dist = dist_v[loop_depth];
3473       if (dist != 0
3474           && !(dist > 0 && DDR_REVERSED_P (ddr))
3475           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3476         return false;
3477     }
3478
3479   if (dump_enabled_p ())
3480     dump_printf_loc (MSG_NOTE, vect_location,
3481                      "dependence distance between %T and %T is >= VF\n",
3482                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3483
3484   return true;
3485 }
3486
3487 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3488
3489 static void
3490 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3491 {
3492   dump_printf (dump_kind, "%s (%T) >= ",
3493                lower_bound.unsigned_p ? "unsigned" : "abs",
3494                lower_bound.expr);
3495   dump_dec (dump_kind, lower_bound.min_value);
3496 }
3497
3498 /* Record that the vectorized loop requires the vec_lower_bound described
3499    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3500
3501 static void
3502 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3503                         poly_uint64 min_value)
3504 {
3505   vec<vec_lower_bound> &lower_bounds
3506     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3507   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3508     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3509       {
3510         unsigned_p &= lower_bounds[i].unsigned_p;
3511         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3512         if (lower_bounds[i].unsigned_p != unsigned_p
3513             || maybe_lt (lower_bounds[i].min_value, min_value))
3514           {
3515             lower_bounds[i].unsigned_p = unsigned_p;
3516             lower_bounds[i].min_value = min_value;
3517             if (dump_enabled_p ())
3518               {
3519                 dump_printf_loc (MSG_NOTE, vect_location,
3520                                  "updating run-time check to ");
3521                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3522                 dump_printf (MSG_NOTE, "\n");
3523               }
3524           }
3525         return;
3526       }
3527
3528   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3529   if (dump_enabled_p ())
3530     {
3531       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3532       dump_lower_bound (MSG_NOTE, lower_bound);
3533       dump_printf (MSG_NOTE, "\n");
3534     }
3535   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3536 }
3537
3538 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3539    will span fewer than GAP bytes.  */
3540
3541 static bool
3542 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3543                   poly_int64 gap)
3544 {
3545   stmt_vec_info stmt_info = dr_info->stmt;
3546   HOST_WIDE_INT count
3547     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3548   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3549     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3550   return (estimated_poly_value (gap)
3551           <= count * vect_get_scalar_dr_size (dr_info));
3552 }
3553
3554 /* Return true if we know that there is no alias between DR_INFO_A and
3555    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3556    When returning true, set *LOWER_BOUND_OUT to this N.  */
3557
3558 static bool
3559 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3560                                 poly_uint64 *lower_bound_out)
3561 {
3562   /* Check that there is a constant gap of known sign between DR_A
3563      and DR_B.  */
3564   data_reference *dr_a = dr_info_a->dr;
3565   data_reference *dr_b = dr_info_b->dr;
3566   poly_int64 init_a, init_b;
3567   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3568       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3569       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3570       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3571       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3572       || !ordered_p (init_a, init_b))
3573     return false;
3574
3575   /* Sort DR_A and DR_B by the address they access.  */
3576   if (maybe_lt (init_b, init_a))
3577     {
3578       std::swap (init_a, init_b);
3579       std::swap (dr_info_a, dr_info_b);
3580       std::swap (dr_a, dr_b);
3581     }
3582
3583   /* If the two accesses could be dependent within a scalar iteration,
3584      make sure that we'd retain their order.  */
3585   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3586       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3587     return false;
3588
3589   /* There is no alias if abs (DR_STEP) is greater than or equal to
3590      the bytes spanned by the combination of the two accesses.  */
3591   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3592   return true;
3593 }
3594
3595 /* Function vect_prune_runtime_alias_test_list.
3596
3597    Prune a list of ddrs to be tested at run-time by versioning for alias.
3598    Merge several alias checks into one if possible.
   Return FALSE if resulting list of ddrs is longer than allowed by
3600    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3601
3602 opt_result
3603 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3604 {
3605   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3606   hash_set <tree_pair_hash> compared_objects;
3607
3608   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3609   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3610     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3611   const vec<vec_object_pair> &check_unequal_addrs
3612     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3613   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3614   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3615
3616   ddr_p ddr;
3617   unsigned int i;
3618   tree length_factor;
3619
3620   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3621
3622   /* Step values are irrelevant for aliasing if the number of vector
3623      iterations is equal to the number of scalar iterations (which can
3624      happen for fully-SLP loops).  */
3625   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3626
3627   if (!vf_one_p)
3628     {
3629       /* Convert the checks for nonzero steps into bound tests.  */
3630       tree value;
3631       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3632         vect_check_lower_bound (loop_vinfo, value, true, 1);
3633     }
3634
3635   if (may_alias_ddrs.is_empty ())
3636     return opt_result::success ();
3637
3638   comp_alias_ddrs.create (may_alias_ddrs.length ());
3639
3640   unsigned int loop_depth
3641     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3642                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3643
3644   /* First, we collect all data ref pairs for aliasing checks.  */
3645   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3646     {
3647       poly_uint64 lower_bound;
3648       tree segment_length_a, segment_length_b;
3649       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3650       unsigned int align_a, align_b;
3651
3652       /* Ignore the alias if the VF we chose ended up being no greater
3653          than the dependence distance.  */
3654       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3655         continue;
3656
3657       if (DDR_OBJECT_A (ddr))
3658         {
3659           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3660           if (!compared_objects.add (new_pair))
3661             {
3662               if (dump_enabled_p ())
3663                 dump_printf_loc (MSG_NOTE, vect_location,
3664                                  "checking that %T and %T"
3665                                  " have different addresses\n",
3666                                  new_pair.first, new_pair.second);
3667               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3668             }
3669           continue;
3670         }
3671
3672       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3673       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3674
3675       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3676       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3677
3678       bool preserves_scalar_order_p
3679         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3680       bool ignore_step_p
3681           = (vf_one_p
3682              && (preserves_scalar_order_p
3683                  || operand_equal_p (DR_STEP (dr_info_a->dr),
3684                                      DR_STEP (dr_info_b->dr))));
3685
3686       /* Skip the pair if inter-iteration dependencies are irrelevant
3687          and intra-iteration dependencies are guaranteed to be honored.  */
3688       if (ignore_step_p
3689           && (preserves_scalar_order_p
3690               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3691                                                  &lower_bound)))
3692         {
3693           if (dump_enabled_p ())
3694             dump_printf_loc (MSG_NOTE, vect_location,
3695                              "no need for alias check between "
3696                              "%T and %T when VF is 1\n",
3697                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3698           continue;
3699         }
3700
3701       /* See whether we can handle the alias using a bounds check on
3702          the step, and whether that's likely to be the best approach.
3703          (It might not be, for example, if the minimum step is much larger
3704          than the number of bytes handled by one vector iteration.)  */
3705       if (!ignore_step_p
3706           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3707           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3708                                              &lower_bound)
3709           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3710               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3711         {
3712           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3713           if (dump_enabled_p ())
3714             {
3715               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3716                                "%T and %T when the step %T is outside ",
3717                                DR_REF (dr_info_a->dr),
3718                                DR_REF (dr_info_b->dr),
3719                                DR_STEP (dr_info_a->dr));
3720               if (unsigned_p)
3721                 dump_printf (MSG_NOTE, "[0");
3722               else
3723                 {
3724                   dump_printf (MSG_NOTE, "(");
3725                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3726                 }
3727               dump_printf (MSG_NOTE, ", ");
3728               dump_dec (MSG_NOTE, lower_bound);
3729               dump_printf (MSG_NOTE, ")\n");
3730             }
3731           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3732                                   unsigned_p, lower_bound);
3733           continue;
3734         }
3735
3736       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3737       if (dr_group_first_a)
3738         {
3739           stmt_info_a = dr_group_first_a;
3740           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741         }
3742
3743       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3744       if (dr_group_first_b)
3745         {
3746           stmt_info_b = dr_group_first_b;
3747           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748         }
3749
3750       if (ignore_step_p)
3751         {
3752           segment_length_a = size_zero_node;
3753           segment_length_b = size_zero_node;
3754         }
3755       else
3756         {
3757           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3758                                 DR_STEP (dr_info_b->dr), 0))
3759             length_factor = scalar_loop_iters;
3760           else
3761             length_factor = size_int (vect_factor);
3762           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3763           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3764         }
3765       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3766       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3767       align_a = vect_vfa_align (dr_info_a);
3768       align_b = vect_vfa_align (dr_info_b);
3769
3770       /* See whether the alias is known at compilation time.  */
3771       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3772                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
3773           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3774                               DR_OFFSET (dr_info_b->dr), 0)
3775           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3776           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3777           && poly_int_tree_p (segment_length_a)
3778           && poly_int_tree_p (segment_length_b))
3779         {
3780           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3781                                              segment_length_a,
3782                                              segment_length_b,
3783                                              access_size_a,
3784                                              access_size_b);
3785           if (res >= 0 && dump_enabled_p ())
3786             {
3787               dump_printf_loc (MSG_NOTE, vect_location,
3788                                "can tell at compile time that %T and %T",
3789                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3790               if (res == 0)
3791                 dump_printf (MSG_NOTE, " do not alias\n");
3792               else
3793                 dump_printf (MSG_NOTE, " alias\n");
3794             }
3795
3796           if (res == 0)
3797             continue;
3798
3799           if (res == 1)
3800             return opt_result::failure_at (stmt_info_b->stmt,
3801                                            "not vectorized:"
3802                                            " compilation time alias: %G%G",
3803                                            stmt_info_a->stmt,
3804                                            stmt_info_b->stmt);
3805         }
3806
3807       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3808                             access_size_a, align_a);
3809       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3810                             access_size_b, align_b);
3811       /* Canonicalize the order to be the one that's needed for accurate
3812          RAW, WAR and WAW flags, in cases where the data references are
3813          well-ordered.  The order doesn't really matter otherwise,
3814          but we might as well be consistent.  */
3815       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3816         std::swap (dr_a, dr_b);
3817
3818       dr_with_seg_len_pair_t dr_with_seg_len_pair
3819         (dr_a, dr_b, (preserves_scalar_order_p
3820                       ? dr_with_seg_len_pair_t::WELL_ORDERED
3821                       : dr_with_seg_len_pair_t::REORDERED));
3822
3823       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824     }
3825
3826   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3827
3828   unsigned int count = (comp_alias_ddrs.length ()
3829                         + check_unequal_addrs.length ());
3830
3831   if (count
3832       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3833           == VECT_COST_MODEL_VERY_CHEAP))
3834     return opt_result::failure_at
3835       (vect_location, "would need a runtime alias check\n");
3836
3837   if (dump_enabled_p ())
3838     dump_printf_loc (MSG_NOTE, vect_location,
3839                      "improved number of alias checks from %d to %d\n",
3840                      may_alias_ddrs.length (), count);
3841   unsigned limit = param_vect_max_version_for_alias_checks;
3842   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3843     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3844   if (count > limit)
3845     return opt_result::failure_at
3846       (vect_location,
3847        "number of versioning for alias run-time tests exceeds %d "
3848        "(--param vect-max-version-for-alias-checks)\n", limit);
3849
3850   return opt_result::success ();
3851 }
3852
3853 /* Check whether we can use an internal function for a gather load
3854    or scatter store.  READ_P is true for loads and false for stores.
3855    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3856    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3857    is the type of the offset that is being applied to the invariant
3858    base address.  SCALE is the amount by which the offset should
3859    be multiplied *after* it has been converted to address width.
3860
3861    Return true if the function is supported, storing the function id in
3862    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
3863
3864 bool
3865 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3866                           tree vectype, tree memory_type, tree offset_type,
3867                           int scale, internal_fn *ifn_out,
3868                           tree *offset_vectype_out)
3869 {
3870   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3871   unsigned int element_bits = vector_element_bits (vectype);
3872   if (element_bits != memory_bits)
3873     /* For now the vector elements must be the same width as the
3874        memory elements.  */
3875     return false;
3876
3877   /* Work out which function we need.  */
3878   internal_fn ifn, alt_ifn;
3879   if (read_p)
3880     {
3881       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3882       alt_ifn = IFN_MASK_GATHER_LOAD;
3883     }
3884   else
3885     {
3886       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3887       alt_ifn = IFN_MASK_SCATTER_STORE;
3888     }
3889
3890   for (;;)
3891     {
3892       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3893       if (!offset_vectype)
3894         return false;
3895
3896       /* Test whether the target supports this combination.  */
3897       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3898                                                   offset_vectype, scale))
3899         {
3900           *ifn_out = ifn;
3901           *offset_vectype_out = offset_vectype;
3902           return true;
3903         }
3904       else if (!masked_p
3905                && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3906                                                           memory_type,
3907                                                           offset_vectype,
3908                                                           scale))
3909         {
3910           *ifn_out = alt_ifn;
3911           *offset_vectype_out = offset_vectype;
3912           return true;
3913         }
3914
3915       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3916           && TYPE_PRECISION (offset_type) >= element_bits)
3917         return false;
3918
3919       offset_type = build_nonstandard_integer_type
3920         (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3921     }
3922 }
3923
3924 /* STMT_INFO is a call to an internal gather load or scatter store function.
3925    Describe the operation in INFO.  */
3926
3927 static void
3928 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3929                                    gather_scatter_info *info)
3930 {
3931   gcall *call = as_a <gcall *> (stmt_info->stmt);
3932   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3933   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3934
3935   info->ifn = gimple_call_internal_fn (call);
3936   info->decl = NULL_TREE;
3937   info->base = gimple_call_arg (call, 0);
3938   info->offset = gimple_call_arg (call, 1);
3939   info->offset_dt = vect_unknown_def_type;
3940   info->offset_vectype = NULL_TREE;
3941   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3942   info->element_type = TREE_TYPE (vectype);
3943   info->memory_type = TREE_TYPE (DR_REF (dr));
3944 }
3945
3946 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3947    gather load or scatter store.  Describe the operation in *INFO if so.  */
3948
3949 bool
3950 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3951                            gather_scatter_info *info)
3952 {
3953   HOST_WIDE_INT scale = 1;
3954   poly_int64 pbitpos, pbitsize;
3955   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3956   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3957   tree offtype = NULL_TREE;
3958   tree decl = NULL_TREE, base, off;
3959   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3960   tree memory_type = TREE_TYPE (DR_REF (dr));
3961   machine_mode pmode;
3962   int punsignedp, reversep, pvolatilep = 0;
3963   internal_fn ifn;
3964   tree offset_vectype;
3965   bool masked_p = false;
3966
3967   /* See whether this is already a call to a gather/scatter internal function.
3968      If not, see whether it's a masked load or store.  */
3969   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3970   if (call && gimple_call_internal_p (call))
3971     {
3972       ifn = gimple_call_internal_fn (call);
3973       if (internal_gather_scatter_fn_p (ifn))
3974         {
3975           vect_describe_gather_scatter_call (stmt_info, info);
3976           return true;
3977         }
3978       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3979     }
3980
3981   /* True if we should aim to use internal functions rather than
3982      built-in functions.  */
3983   bool use_ifn_p = (DR_IS_READ (dr)
3984                     ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3985                     : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3986
3987   base = DR_REF (dr);
3988   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3989      see if we can use the def stmt of the address.  */
3990   if (masked_p
3991       && TREE_CODE (base) == MEM_REF
3992       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3993       && integer_zerop (TREE_OPERAND (base, 1))
3994       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3995     {
3996       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3997       if (is_gimple_assign (def_stmt)
3998           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3999         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4000     }
4001
4002   /* The gather and scatter builtins need address of the form
4003      loop_invariant + vector * {1, 2, 4, 8}
4004      or
4005      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4006      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4007      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4008      multiplications and additions in it.  To get a vector, we need
4009      a single SSA_NAME that will be defined in the loop and will
4010      contain everything that is not loop invariant and that can be
4011      vectorized.  The following code attempts to find such a preexistng
4012      SSA_NAME OFF and put the loop invariants into a tree BASE
4013      that can be gimplified before the loop.  */
4014   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4015                               &punsignedp, &reversep, &pvolatilep);
4016   if (reversep)
4017     return false;
4018
4019   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4020
4021   if (TREE_CODE (base) == MEM_REF)
4022     {
4023       if (!integer_zerop (TREE_OPERAND (base, 1)))
4024         {
4025           if (off == NULL_TREE)
4026             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4027           else
4028             off = size_binop (PLUS_EXPR, off,
4029                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
4030         }
4031       base = TREE_OPERAND (base, 0);
4032     }
4033   else
4034     base = build_fold_addr_expr (base);
4035
4036   if (off == NULL_TREE)
4037     off = size_zero_node;
4038
4039   /* If base is not loop invariant, either off is 0, then we start with just
4040      the constant offset in the loop invariant BASE and continue with base
4041      as OFF, otherwise give up.
4042      We could handle that case by gimplifying the addition of base + off
4043      into some SSA_NAME and use that as off, but for now punt.  */
4044   if (!expr_invariant_in_loop_p (loop, base))
4045     {
4046       if (!integer_zerop (off))
4047         return false;
4048       off = base;
4049       base = size_int (pbytepos);
4050     }
4051   /* Otherwise put base + constant offset into the loop invariant BASE
4052      and continue with OFF.  */
4053   else
4054     {
4055       base = fold_convert (sizetype, base);
4056       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4057     }
4058
4059   /* OFF at this point may be either a SSA_NAME or some tree expression
4060      from get_inner_reference.  Try to peel off loop invariants from it
4061      into BASE as long as possible.  */
4062   STRIP_NOPS (off);
4063   while (offtype == NULL_TREE)
4064     {
4065       enum tree_code code;
4066       tree op0, op1, add = NULL_TREE;
4067
4068       if (TREE_CODE (off) == SSA_NAME)
4069         {
4070           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4071
4072           if (expr_invariant_in_loop_p (loop, off))
4073             return false;
4074
4075           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4076             break;
4077
4078           op0 = gimple_assign_rhs1 (def_stmt);
4079           code = gimple_assign_rhs_code (def_stmt);
4080           op1 = gimple_assign_rhs2 (def_stmt);
4081         }
4082       else
4083         {
4084           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4085             return false;
4086           code = TREE_CODE (off);
4087           extract_ops_from_tree (off, &code, &op0, &op1);
4088         }
4089       switch (code)
4090         {
4091         case POINTER_PLUS_EXPR:
4092         case PLUS_EXPR:
4093           if (expr_invariant_in_loop_p (loop, op0))
4094             {
4095               add = op0;
4096               off = op1;
4097             do_add:
4098               add = fold_convert (sizetype, add);
4099               if (scale != 1)
4100                 add = size_binop (MULT_EXPR, add, size_int (scale));
4101               base = size_binop (PLUS_EXPR, base, add);
4102               continue;
4103             }
4104           if (expr_invariant_in_loop_p (loop, op1))
4105             {
4106               add = op1;
4107               off = op0;
4108               goto do_add;
4109             }
4110           break;
4111         case MINUS_EXPR:
4112           if (expr_invariant_in_loop_p (loop, op1))
4113             {
4114               add = fold_convert (sizetype, op1);
4115               add = size_binop (MINUS_EXPR, size_zero_node, add);
4116               off = op0;
4117               goto do_add;
4118             }
4119           break;
4120         case MULT_EXPR:
4121           if (scale == 1 && tree_fits_shwi_p (op1))
4122             {
4123               int new_scale = tree_to_shwi (op1);
4124               /* Only treat this as a scaling operation if the target
4125                  supports it for at least some offset type.  */
4126               if (use_ifn_p
4127                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4128                                                 masked_p, vectype, memory_type,
4129                                                 signed_char_type_node,
4130                                                 new_scale, &ifn,
4131                                                 &offset_vectype)
4132                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4133                                                 masked_p, vectype, memory_type,
4134                                                 unsigned_char_type_node,
4135                                                 new_scale, &ifn,
4136                                                 &offset_vectype))
4137                 break;
4138               scale = new_scale;
4139               off = op0;
4140               continue;
4141             }
4142           break;
4143         case SSA_NAME:
4144           off = op0;
4145           continue;
4146         CASE_CONVERT:
4147           if (!POINTER_TYPE_P (TREE_TYPE (op0))
4148               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4149             break;
4150
4151           /* Don't include the conversion if the target is happy with
4152              the current offset type.  */
4153           if (use_ifn_p
4154               && !POINTER_TYPE_P (TREE_TYPE (off))
4155               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4156                                            masked_p, vectype, memory_type,
4157                                            TREE_TYPE (off), scale, &ifn,
4158                                            &offset_vectype))
4159             break;
4160
4161           if (TYPE_PRECISION (TREE_TYPE (op0))
4162               == TYPE_PRECISION (TREE_TYPE (off)))
4163             {
4164               off = op0;
4165               continue;
4166             }
4167
4168           /* Include the conversion if it is widening and we're using
4169              the IFN path or the target can handle the converted from
4170              offset or the current size is not already the same as the
4171              data vector element size.  */
4172           if ((TYPE_PRECISION (TREE_TYPE (op0))
4173                < TYPE_PRECISION (TREE_TYPE (off)))
4174               && (use_ifn_p
4175                   || (DR_IS_READ (dr)
4176                       ? (targetm.vectorize.builtin_gather
4177                          && targetm.vectorize.builtin_gather (vectype,
4178                                                               TREE_TYPE (op0),
4179                                                               scale))
4180                       : (targetm.vectorize.builtin_scatter
4181                          && targetm.vectorize.builtin_scatter (vectype,
4182                                                                TREE_TYPE (op0),
4183                                                                scale)))
4184                   || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4185                                        TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4186             {
4187               off = op0;
4188               offtype = TREE_TYPE (off);
4189               STRIP_NOPS (off);
4190               continue;
4191             }
4192           break;
4193         default:
4194           break;
4195         }
4196       break;
4197     }
4198
4199   /* If at the end OFF still isn't a SSA_NAME or isn't
4200      defined in the loop, punt.  */
4201   if (TREE_CODE (off) != SSA_NAME
4202       || expr_invariant_in_loop_p (loop, off))
4203     return false;
4204
4205   if (offtype == NULL_TREE)
4206     offtype = TREE_TYPE (off);
4207
4208   if (use_ifn_p)
4209     {
4210       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4211                                      vectype, memory_type, offtype, scale,
4212                                      &ifn, &offset_vectype))
4213         ifn = IFN_LAST;
4214       decl = NULL_TREE;
4215     }
4216   else
4217     {
4218       if (DR_IS_READ (dr))
4219         {
4220           if (targetm.vectorize.builtin_gather)
4221             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4222         }
4223       else
4224         {
4225           if (targetm.vectorize.builtin_scatter)
4226             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4227         }
4228       ifn = IFN_LAST;
4229       /* The offset vector type will be read from DECL when needed.  */
4230       offset_vectype = NULL_TREE;
4231     }
4232
4233   info->ifn = ifn;
4234   info->decl = decl;
4235   info->base = base;
4236   info->offset = off;
4237   info->offset_dt = vect_unknown_def_type;
4238   info->offset_vectype = offset_vectype;
4239   info->scale = scale;
4240   info->element_type = TREE_TYPE (vectype);
4241   info->memory_type = memory_type;
4242   return true;
4243 }
4244
4245 /* Find the data references in STMT, analyze them with respect to LOOP and
4246    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4247    be handled.  */
4248
4249 opt_result
4250 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4251                                vec<data_reference_p> *datarefs,
4252                                vec<int> *dataref_groups, int group_id)
4253 {
4254   /* We can ignore clobbers for dataref analysis - they are removed during
4255      loop vectorization and BB vectorization checks dependences with a
4256      stmt walk.  */
4257   if (gimple_clobber_p (stmt))
4258     return opt_result::success ();
4259
4260   if (gimple_has_volatile_ops (stmt))
4261     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4262                                    stmt);
4263
4264   if (stmt_can_throw_internal (cfun, stmt))
4265     return opt_result::failure_at (stmt,
4266                                    "not vectorized:"
4267                                    " statement can throw an exception: %G",
4268                                    stmt);
4269
4270   auto_vec<data_reference_p, 2> refs;
4271   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4272   if (!res)
4273     return res;
4274
4275   if (refs.is_empty ())
4276     return opt_result::success ();
4277
4278   if (refs.length () > 1)
4279     {
4280       while (!refs.is_empty ())
4281         free_data_ref (refs.pop ());
4282       return opt_result::failure_at (stmt,
4283                                      "not vectorized: more than one "
4284                                      "data ref in stmt: %G", stmt);
4285     }
4286
4287   data_reference_p dr = refs.pop ();
4288   if (gcall *call = dyn_cast <gcall *> (stmt))
4289     if (!gimple_call_internal_p (call)
4290         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4291             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4292       {
4293         free_data_ref (dr);
4294         return opt_result::failure_at (stmt,
4295                                        "not vectorized: dr in a call %G", stmt);
4296       }
4297
4298   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4299       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4300     {
4301       free_data_ref (dr);
4302       return opt_result::failure_at (stmt,
4303                                      "not vectorized:"
4304                                      " statement is bitfield access %G", stmt);
4305     }
4306
4307   if (DR_BASE_ADDRESS (dr)
4308       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4309     {
4310       free_data_ref (dr);
4311       return opt_result::failure_at (stmt,
4312                                      "not vectorized:"
4313                                      " base addr of dr is a constant\n");
4314     }
4315
4316   /* Check whether this may be a SIMD lane access and adjust the
4317      DR to make it easier for us to handle it.  */
4318   if (loop
4319       && loop->simduid
4320       && (!DR_BASE_ADDRESS (dr)
4321           || !DR_OFFSET (dr)
4322           || !DR_INIT (dr)
4323           || !DR_STEP (dr)))
4324     {
4325       struct data_reference *newdr
4326         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4327                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4328       if (DR_BASE_ADDRESS (newdr)
4329           && DR_OFFSET (newdr)
4330           && DR_INIT (newdr)
4331           && DR_STEP (newdr)
4332           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4333           && integer_zerop (DR_STEP (newdr)))
4334         {
4335           tree base_address = DR_BASE_ADDRESS (newdr);
4336           tree off = DR_OFFSET (newdr);
4337           tree step = ssize_int (1);
4338           if (integer_zerop (off)
4339               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4340             {
4341               off = TREE_OPERAND (base_address, 1);
4342               base_address = TREE_OPERAND (base_address, 0);
4343             }
4344           STRIP_NOPS (off);
4345           if (TREE_CODE (off) == MULT_EXPR
4346               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4347             {
4348               step = TREE_OPERAND (off, 1);
4349               off = TREE_OPERAND (off, 0);
4350               STRIP_NOPS (off);
4351             }
4352           if (CONVERT_EXPR_P (off)
4353               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4354                   < TYPE_PRECISION (TREE_TYPE (off))))
4355             off = TREE_OPERAND (off, 0);
4356           if (TREE_CODE (off) == SSA_NAME)
4357             {
4358               gimple *def = SSA_NAME_DEF_STMT (off);
4359               /* Look through widening conversion.  */
4360               if (is_gimple_assign (def)
4361                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4362                 {
4363                   tree rhs1 = gimple_assign_rhs1 (def);
4364                   if (TREE_CODE (rhs1) == SSA_NAME
4365                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4366                       && (TYPE_PRECISION (TREE_TYPE (off))
4367                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4368                     def = SSA_NAME_DEF_STMT (rhs1);
4369                 }
4370               if (is_gimple_call (def)
4371                   && gimple_call_internal_p (def)
4372                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4373                 {
4374                   tree arg = gimple_call_arg (def, 0);
4375                   tree reft = TREE_TYPE (DR_REF (newdr));
4376                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4377                   arg = SSA_NAME_VAR (arg);
4378                   if (arg == loop->simduid
4379                       /* For now.  */
4380                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4381                     {
4382                       DR_BASE_ADDRESS (newdr) = base_address;
4383                       DR_OFFSET (newdr) = ssize_int (0);
4384                       DR_STEP (newdr) = step;
4385                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4386                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4387                       /* Mark as simd-lane access.  */
4388                       tree arg2 = gimple_call_arg (def, 1);
4389                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4390                       free_data_ref (dr);
4391                       datarefs->safe_push (newdr);
4392                       if (dataref_groups)
4393                         dataref_groups->safe_push (group_id);
4394                       return opt_result::success ();
4395                     }
4396                 }
4397             }
4398         }
4399       free_data_ref (newdr);
4400     }
4401
4402   datarefs->safe_push (dr);
4403   if (dataref_groups)
4404     dataref_groups->safe_push (group_id);
4405   return opt_result::success ();
4406 }
4407
4408 /* Function vect_analyze_data_refs.
4409
4410   Find all the data references in the loop or basic block.
4411
4412    The general structure of the analysis of data refs in the vectorizer is as
4413    follows:
4414    1- vect_analyze_data_refs(loop/bb): call
4415       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4416       in the loop/bb and their dependences.
4417    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4418    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4419    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4420
4421 */
4422
4423 opt_result
4424 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4425 {
4426   class loop *loop = NULL;
4427   unsigned int i;
4428   struct data_reference *dr;
4429   tree scalar_type;
4430
4431   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4432
4433   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4434     loop = LOOP_VINFO_LOOP (loop_vinfo);
4435
4436   /* Go through the data-refs, check that the analysis succeeded.  Update
4437      pointer from stmt_vec_info struct to DR and vectype.  */
4438
4439   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4440   FOR_EACH_VEC_ELT (datarefs, i, dr)
4441     {
4442       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4443       poly_uint64 vf;
4444
4445       gcc_assert (DR_REF (dr));
4446       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4447       gcc_assert (!stmt_info->dr_aux.dr);
4448       stmt_info->dr_aux.dr = dr;
4449       stmt_info->dr_aux.stmt = stmt_info;
4450
4451       /* Check that analysis of the data-ref succeeded.  */
4452       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4453           || !DR_STEP (dr))
4454         {
4455           bool maybe_gather
4456             = DR_IS_READ (dr)
4457               && !TREE_THIS_VOLATILE (DR_REF (dr));
4458           bool maybe_scatter
4459             = DR_IS_WRITE (dr)
4460               && !TREE_THIS_VOLATILE (DR_REF (dr))
4461               && (targetm.vectorize.builtin_scatter != NULL
4462                   || supports_vec_scatter_store_p ());
4463
4464           /* If target supports vector gather loads or scatter stores,
4465              see if they can't be used.  */
4466           if (is_a <loop_vec_info> (vinfo)
4467               && !nested_in_vect_loop_p (loop, stmt_info))
4468             {
4469               if (maybe_gather || maybe_scatter)
4470                 {
4471                   if (maybe_gather)
4472                     gatherscatter = GATHER;
4473                   else
4474                     gatherscatter = SCATTER;
4475                 }
4476             }
4477
4478           if (gatherscatter == SG_NONE)
4479             {
4480               if (dump_enabled_p ())
4481                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4482                                  "not vectorized: data ref analysis "
4483                                  "failed %G", stmt_info->stmt);
4484               if (is_a <bb_vec_info> (vinfo))
4485                 {
4486                   /* In BB vectorization the ref can still participate
4487                      in dependence analysis, we just can't vectorize it.  */
4488                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4489                   continue;
4490                 }
4491               return opt_result::failure_at (stmt_info->stmt,
4492                                              "not vectorized:"
4493                                              " data ref analysis failed: %G",
4494                                              stmt_info->stmt);
4495             }
4496         }
4497
4498       /* See if this was detected as SIMD lane access.  */
4499       if (dr->aux == (void *)-1
4500           || dr->aux == (void *)-2
4501           || dr->aux == (void *)-3
4502           || dr->aux == (void *)-4)
4503         {
4504           if (nested_in_vect_loop_p (loop, stmt_info))
4505             return opt_result::failure_at (stmt_info->stmt,
4506                                            "not vectorized:"
4507                                            " data ref analysis failed: %G",
4508                                            stmt_info->stmt);
4509           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4510             = -(uintptr_t) dr->aux;
4511         }
4512
4513       tree base = get_base_address (DR_REF (dr));
4514       if (base && VAR_P (base) && DECL_NONALIASED (base))
4515         {
4516           if (dump_enabled_p ())
4517             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4518                              "not vectorized: base object not addressable "
4519                              "for stmt: %G", stmt_info->stmt);
4520           if (is_a <bb_vec_info> (vinfo))
4521             {
4522               /* In BB vectorization the ref can still participate
4523                  in dependence analysis, we just can't vectorize it.  */
4524               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4525               continue;
4526             }
4527           return opt_result::failure_at (stmt_info->stmt,
4528                                          "not vectorized: base object not"
4529                                          " addressable for stmt: %G",
4530                                          stmt_info->stmt);
4531         }
4532
4533       if (is_a <loop_vec_info> (vinfo)
4534           && DR_STEP (dr)
4535           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4536         {
4537           if (nested_in_vect_loop_p (loop, stmt_info))
4538             return opt_result::failure_at (stmt_info->stmt,
4539                                            "not vectorized: "
4540                                            "not suitable for strided load %G",
4541                                            stmt_info->stmt);
4542           STMT_VINFO_STRIDED_P (stmt_info) = true;
4543         }
4544
4545       /* Update DR field in stmt_vec_info struct.  */
4546
4547       /* If the dataref is in an inner-loop of the loop that is considered for
4548          for vectorization, we also want to analyze the access relative to
4549          the outer-loop (DR contains information only relative to the
4550          inner-most enclosing loop).  We do that by building a reference to the
4551          first location accessed by the inner-loop, and analyze it relative to
4552          the outer-loop.  */
4553       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4554         {
4555           /* Build a reference to the first location accessed by the
4556              inner loop: *(BASE + INIT + OFFSET).  By construction,
4557              this address must be invariant in the inner loop, so we
4558              can consider it as being used in the outer loop.  */
4559           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4560           tree offset = unshare_expr (DR_OFFSET (dr));
4561           tree init = unshare_expr (DR_INIT (dr));
4562           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4563                                           init, offset);
4564           tree init_addr = fold_build_pointer_plus (base, init_offset);
4565           tree init_ref = build_fold_indirect_ref (init_addr);
4566
4567           if (dump_enabled_p ())
4568             dump_printf_loc (MSG_NOTE, vect_location,
4569                              "analyze in outer loop: %T\n", init_ref);
4570
4571           opt_result res
4572             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4573                                     init_ref, loop, stmt_info->stmt);
4574           if (!res)
4575             /* dr_analyze_innermost already explained the failure.  */
4576             return res;
4577
4578           if (dump_enabled_p ())
4579             dump_printf_loc (MSG_NOTE, vect_location,
4580                              "\touter base_address: %T\n"
4581                              "\touter offset from base address: %T\n"
4582                              "\touter constant offset from base address: %T\n"
4583                              "\touter step: %T\n"
4584                              "\touter base alignment: %d\n\n"
4585                              "\touter base misalignment: %d\n"
4586                              "\touter offset alignment: %d\n"
4587                              "\touter step alignment: %d\n",
4588                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4589                              STMT_VINFO_DR_OFFSET (stmt_info),
4590                              STMT_VINFO_DR_INIT (stmt_info),
4591                              STMT_VINFO_DR_STEP (stmt_info),
4592                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4593                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4594                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4595                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4596         }
4597
4598       /* Set vectype for STMT.  */
4599       scalar_type = TREE_TYPE (DR_REF (dr));
4600       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4601       if (!vectype)
4602         {
4603           if (dump_enabled_p ())
4604             {
4605               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4606                                "not vectorized: no vectype for stmt: %G",
4607                                stmt_info->stmt);
4608               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4609               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4610                                  scalar_type);
4611               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4612             }
4613
4614           if (is_a <bb_vec_info> (vinfo))
4615             {
4616               /* No vector type is fine, the ref can still participate
4617                  in dependence analysis, we just can't vectorize it.  */
4618               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4619               continue;
4620             }
4621           if (fatal)
4622             *fatal = false;
4623           return opt_result::failure_at (stmt_info->stmt,
4624                                          "not vectorized:"
4625                                          " no vectype for stmt: %G"
4626                                          " scalar_type: %T\n",
4627                                          stmt_info->stmt, scalar_type);
4628         }
4629       else
4630         {
4631           if (dump_enabled_p ())
4632             dump_printf_loc (MSG_NOTE, vect_location,
4633                              "got vectype for stmt: %G%T\n",
4634                              stmt_info->stmt, vectype);
4635         }
4636
4637       /* Adjust the minimal vectorization factor according to the
4638          vector type.  */
4639       vf = TYPE_VECTOR_SUBPARTS (vectype);
4640       *min_vf = upper_bound (*min_vf, vf);
4641
4642       /* Leave the BB vectorizer to pick the vector type later, based on
4643          the final dataref group size and SLP node size.  */
4644       if (is_a <loop_vec_info> (vinfo))
4645         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4646
4647       if (gatherscatter != SG_NONE)
4648         {
4649           gather_scatter_info gs_info;
4650           if (!vect_check_gather_scatter (stmt_info,
4651                                           as_a <loop_vec_info> (vinfo),
4652                                           &gs_info)
4653               || !get_vectype_for_scalar_type (vinfo,
4654                                                TREE_TYPE (gs_info.offset)))
4655             {
4656               if (fatal)
4657                 *fatal = false;
4658               return opt_result::failure_at
4659                         (stmt_info->stmt,
4660                          (gatherscatter == GATHER)
4661                          ? "not vectorized: not suitable for gather load %G"
4662                          : "not vectorized: not suitable for scatter store %G",
4663                          stmt_info->stmt);
4664             }
4665           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4666         }
4667     }
4668
4669   /* We used to stop processing and prune the list here.  Verify we no
4670      longer need to.  */
4671   gcc_assert (i == datarefs.length ());
4672
4673   return opt_result::success ();
4674 }
4675
4676
4677 /* Function vect_get_new_vect_var.
4678
4679    Returns a name for a new variable.  The current naming scheme appends the
4680    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4681    the name of vectorizer generated variables, and appends that to NAME if
4682    provided.  */
4683
4684 tree
4685 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4686 {
4687   const char *prefix;
4688   tree new_vect_var;
4689
4690   switch (var_kind)
4691   {
4692   case vect_simple_var:
4693     prefix = "vect";
4694     break;
4695   case vect_scalar_var:
4696     prefix = "stmp";
4697     break;
4698   case vect_mask_var:
4699     prefix = "mask";
4700     break;
4701   case vect_pointer_var:
4702     prefix = "vectp";
4703     break;
4704   default:
4705     gcc_unreachable ();
4706   }
4707
4708   if (name)
4709     {
4710       char* tmp = concat (prefix, "_", name, NULL);
4711       new_vect_var = create_tmp_reg (type, tmp);
4712       free (tmp);
4713     }
4714   else
4715     new_vect_var = create_tmp_reg (type, prefix);
4716
4717   return new_vect_var;
4718 }
4719
4720 /* Like vect_get_new_vect_var but return an SSA name.  */
4721
4722 tree
4723 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4724 {
4725   const char *prefix;
4726   tree new_vect_var;
4727
4728   switch (var_kind)
4729   {
4730   case vect_simple_var:
4731     prefix = "vect";
4732     break;
4733   case vect_scalar_var:
4734     prefix = "stmp";
4735     break;
4736   case vect_pointer_var:
4737     prefix = "vectp";
4738     break;
4739   default:
4740     gcc_unreachable ();
4741   }
4742
4743   if (name)
4744     {
4745       char* tmp = concat (prefix, "_", name, NULL);
4746       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4747       free (tmp);
4748     }
4749   else
4750     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4751
4752   return new_vect_var;
4753 }
4754
4755 /* Duplicate points-to info on NAME from DR_INFO.  */
4756
4757 static void
4758 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4759 {
4760   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4761   /* DR_PTR_INFO is for a base SSA name, not including constant or
4762      variable offsets in the ref so its alignment info does not apply.  */
4763   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4764 }
4765
4766 /* Function vect_create_addr_base_for_vector_ref.
4767
4768    Create an expression that computes the address of the first memory location
4769    that will be accessed for a data reference.
4770
4771    Input:
4772    STMT_INFO: The statement containing the data reference.
4773    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4774    OFFSET: Optional. If supplied, it is be added to the initial address.
4775    LOOP:    Specify relative to which loop-nest should the address be computed.
4776             For example, when the dataref is in an inner-loop nested in an
4777             outer-loop that is now being vectorized, LOOP can be either the
4778             outer-loop, or the inner-loop.  The first memory location accessed
4779             by the following dataref ('in' points to short):
4780
4781                 for (i=0; i<N; i++)
4782                    for (j=0; j<M; j++)
4783                      s += in[i+j]
4784
4785             is as follows:
4786             if LOOP=i_loop:     &in             (relative to i_loop)
4787             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4788
4789    Output:
4790    1. Return an SSA_NAME whose value is the address of the memory location of
4791       the first vector of the data reference.
4792    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4793       these statement(s) which define the returned SSA_NAME.
4794
4795    FORNOW: We are only handling array accesses with step 1.  */
4796
4797 tree
4798 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4799                                       gimple_seq *new_stmt_list,
4800                                       tree offset)
4801 {
4802   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4803   struct data_reference *dr = dr_info->dr;
4804   const char *base_name;
4805   tree addr_base;
4806   tree dest;
4807   gimple_seq seq = NULL;
4808   tree vect_ptr_type;
4809   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4810   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4811
4812   tree data_ref_base = unshare_expr (drb->base_address);
4813   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4814   tree init = unshare_expr (drb->init);
4815
4816   if (loop_vinfo)
4817     base_name = get_name (data_ref_base);
4818   else
4819     {
4820       base_offset = ssize_int (0);
4821       init = ssize_int (0);
4822       base_name = get_name (DR_REF (dr));
4823     }
4824
4825   /* Create base_offset */
4826   base_offset = size_binop (PLUS_EXPR,
4827                             fold_convert (sizetype, base_offset),
4828                             fold_convert (sizetype, init));
4829
4830   if (offset)
4831     {
4832       offset = fold_convert (sizetype, offset);
4833       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4834                                  base_offset, offset);
4835     }
4836
4837   /* base + base_offset */
4838   if (loop_vinfo)
4839     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4840   else
4841     {
4842       addr_base = build1 (ADDR_EXPR,
4843                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4844                           unshare_expr (DR_REF (dr)));
4845     }
4846
4847   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4848   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4849   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4850   gimple_seq_add_seq (new_stmt_list, seq);
4851
4852   if (DR_PTR_INFO (dr)
4853       && TREE_CODE (addr_base) == SSA_NAME
4854       /* We should only duplicate pointer info to newly created SSA names.  */
4855       && SSA_NAME_VAR (addr_base) == dest)
4856     {
4857       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4858       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4859     }
4860
4861   if (dump_enabled_p ())
4862     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4863
4864   return addr_base;
4865 }
4866
4867
4868 /* Function vect_create_data_ref_ptr.
4869
4870    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4871    location accessed in the loop by STMT_INFO, along with the def-use update
4872    chain to appropriately advance the pointer through the loop iterations.
4873    Also set aliasing information for the pointer.  This pointer is used by
4874    the callers to this function to create a memory reference expression for
4875    vector load/store access.
4876
4877    Input:
4878    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4879          GIMPLE_ASSIGN <name, data-ref> or
4880          GIMPLE_ASSIGN <data-ref, name>.
4881    2. AGGR_TYPE: the type of the reference, which should be either a vector
4882         or an array.
4883    3. AT_LOOP: the loop where the vector memref is to be created.
4884    4. OFFSET (optional): a byte offset to be added to the initial address
4885         accessed by the data-ref in STMT_INFO.
4886    5. BSI: location where the new stmts are to be placed if there is no loop
4887    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4888         pointing to the initial address.
4889    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4890         to the IV during each iteration of the loop.  NULL says to move
4891         by one copy of AGGR_TYPE up or down, depending on the step of the
4892         data reference.
4893
4894    Output:
4895    1. Declare a new ptr to vector_type, and have it point to the base of the
4896       data reference (initial addressed accessed by the data reference).
4897       For example, for vector of type V8HI, the following code is generated:
4898
4899       v8hi *ap;
4900       ap = (v8hi *)initial_address;
4901
4902       if OFFSET is not supplied:
4903          initial_address = &a[init];
4904       if OFFSET is supplied:
4905          initial_address = &a[init] + OFFSET;
4906       if BYTE_OFFSET is supplied:
4907          initial_address = &a[init] + BYTE_OFFSET;
4908
4909       Return the initial_address in INITIAL_ADDRESS.
4910
4911    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4912       update the pointer in each iteration of the loop.
4913
4914       Return the increment stmt that updates the pointer in PTR_INCR.
4915
4916    3. Return the pointer.  */
4917
4918 tree
4919 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4920                           tree aggr_type, class loop *at_loop, tree offset,
4921                           tree *initial_address, gimple_stmt_iterator *gsi,
4922                           gimple **ptr_incr, bool only_init,
4923                           tree iv_step)
4924 {
4925   const char *base_name;
4926   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4927   class loop *loop = NULL;
4928   bool nested_in_vect_loop = false;
4929   class loop *containing_loop = NULL;
4930   tree aggr_ptr_type;
4931   tree aggr_ptr;
4932   tree new_temp;
4933   gimple_seq new_stmt_list = NULL;
4934   edge pe = NULL;
4935   basic_block new_bb;
4936   tree aggr_ptr_init;
4937   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4938   struct data_reference *dr = dr_info->dr;
4939   tree aptr;
4940   gimple_stmt_iterator incr_gsi;
4941   bool insert_after;
4942   tree indx_before_incr, indx_after_incr;
4943   gimple *incr;
4944   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4945
4946   gcc_assert (iv_step != NULL_TREE
4947               || TREE_CODE (aggr_type) == ARRAY_TYPE
4948               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4949
4950   if (loop_vinfo)
4951     {
4952       loop = LOOP_VINFO_LOOP (loop_vinfo);
4953       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4954       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4955       pe = loop_preheader_edge (loop);
4956     }
4957   else
4958     {
4959       gcc_assert (bb_vinfo);
4960       only_init = true;
4961       *ptr_incr = NULL;
4962     }
4963
4964   /* Create an expression for the first address accessed by this load
4965      in LOOP.  */
4966   base_name = get_name (DR_BASE_ADDRESS (dr));
4967
4968   if (dump_enabled_p ())
4969     {
4970       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4971       dump_printf_loc (MSG_NOTE, vect_location,
4972                        "create %s-pointer variable to type: %T",
4973                        get_tree_code_name (TREE_CODE (aggr_type)),
4974                        aggr_type);
4975       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4976         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4977       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4978         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4979       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4980         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4981       else
4982         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4983       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4984     }
4985
4986   /* (1) Create the new aggregate-pointer variable.
4987      Vector and array types inherit the alias set of their component
4988      type by default so we need to use a ref-all pointer if the data
4989      reference does not conflict with the created aggregated data
4990      reference because it is not addressable.  */
4991   bool need_ref_all = false;
4992   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4993                               get_alias_set (DR_REF (dr))))
4994     need_ref_all = true;
4995   /* Likewise for any of the data references in the stmt group.  */
4996   else if (DR_GROUP_SIZE (stmt_info) > 1)
4997     {
4998       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4999       do
5000         {
5001           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5002           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5003                                       get_alias_set (DR_REF (sdr))))
5004             {
5005               need_ref_all = true;
5006               break;
5007             }
5008           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5009         }
5010       while (sinfo);
5011     }
5012   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5013                                                need_ref_all);
5014   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5015
5016
5017   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5018      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5019      def-use update cycles for the pointer: one relative to the outer-loop
5020      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5021      to the inner-loop (which is the inner-most loop containing the dataref),
5022      and this is done be step (5) below.
5023
5024      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5025      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5026      redundant.  Steps (3),(4) create the following:
5027
5028         vp0 = &base_addr;
5029         LOOP:   vp1 = phi(vp0,vp2)
5030                 ...
5031                 ...
5032                 vp2 = vp1 + step
5033                 goto LOOP
5034
5035      If there is an inner-loop nested in loop, then step (5) will also be
5036      applied, and an additional update in the inner-loop will be created:
5037
5038         vp0 = &base_addr;
5039         LOOP:   vp1 = phi(vp0,vp2)
5040                 ...
5041         inner:     vp3 = phi(vp1,vp4)
5042                    vp4 = vp3 + inner_step
5043                    if () goto inner
5044                 ...
5045                 vp2 = vp1 + step
5046                 if () goto LOOP   */
5047
5048   /* (2) Calculate the initial address of the aggregate-pointer, and set
5049      the aggregate-pointer to point to it before the loop.  */
5050
5051   /* Create: (&(base[init_val]+offset) in the loop preheader.  */
5052
5053   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5054                                                    stmt_info, &new_stmt_list,
5055                                                    offset);
5056   if (new_stmt_list)
5057     {
5058       if (pe)
5059         {
5060           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5061           gcc_assert (!new_bb);
5062         }
5063       else
5064         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5065     }
5066
5067   *initial_address = new_temp;
5068   aggr_ptr_init = new_temp;
5069
5070   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5071      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5072      inner-loop nested in LOOP (during outer-loop vectorization).  */
5073
5074   /* No update in loop is required.  */
5075   if (only_init && (!loop_vinfo || at_loop == loop))
5076     aptr = aggr_ptr_init;
5077   else
5078     {
5079       /* Accesses to invariant addresses should be handled specially
5080          by the caller.  */
5081       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5082       gcc_assert (!integer_zerop (step));
5083
5084       if (iv_step == NULL_TREE)
5085         {
5086           /* The step of the aggregate pointer is the type size,
5087              negated for downward accesses.  */
5088           iv_step = TYPE_SIZE_UNIT (aggr_type);
5089           if (tree_int_cst_sgn (step) == -1)
5090             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5091         }
5092
5093       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5094
5095       create_iv (aggr_ptr_init,
5096                  fold_convert (aggr_ptr_type, iv_step),
5097                  aggr_ptr, loop, &incr_gsi, insert_after,
5098                  &indx_before_incr, &indx_after_incr);
5099       incr = gsi_stmt (incr_gsi);
5100
5101       /* Copy the points-to information if it exists. */
5102       if (DR_PTR_INFO (dr))
5103         {
5104           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5105           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5106         }
5107       if (ptr_incr)
5108         *ptr_incr = incr;
5109
5110       aptr = indx_before_incr;
5111     }
5112
5113   if (!nested_in_vect_loop || only_init)
5114     return aptr;
5115
5116
5117   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5118      nested in LOOP, if exists.  */
5119
5120   gcc_assert (nested_in_vect_loop);
5121   if (!only_init)
5122     {
5123       standard_iv_increment_position (containing_loop, &incr_gsi,
5124                                       &insert_after);
5125       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5126                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5127                  &indx_after_incr);
5128       incr = gsi_stmt (incr_gsi);
5129
5130       /* Copy the points-to information if it exists. */
5131       if (DR_PTR_INFO (dr))
5132         {
5133           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5134           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5135         }
5136       if (ptr_incr)
5137         *ptr_incr = incr;
5138
5139       return indx_before_incr;
5140     }
5141   else
5142     gcc_unreachable ();
5143 }
5144
5145
5146 /* Function bump_vector_ptr
5147
5148    Increment a pointer (to a vector type) by vector-size. If requested,
5149    i.e. if PTR-INCR is given, then also connect the new increment stmt
5150    to the existing def-use update-chain of the pointer, by modifying
5151    the PTR_INCR as illustrated below:
5152
5153    The pointer def-use update-chain before this function:
5154                         DATAREF_PTR = phi (p_0, p_2)
5155                         ....
5156         PTR_INCR:       p_2 = DATAREF_PTR + step
5157
5158    The pointer def-use update-chain after this function:
5159                         DATAREF_PTR = phi (p_0, p_2)
5160                         ....
5161                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5162                         ....
5163         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5164
5165    Input:
5166    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5167                  in the loop.
5168    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5169               the loop.  The increment amount across iterations is expected
5170               to be vector_size.
5171    BSI - location where the new update stmt is to be placed.
5172    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5173    BUMP - optional. The offset by which to bump the pointer. If not given,
5174           the offset is assumed to be vector_size.
5175
5176    Output: Return NEW_DATAREF_PTR as illustrated above.
5177
5178 */
5179
5180 tree
5181 bump_vector_ptr (vec_info *vinfo,
5182                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5183                  stmt_vec_info stmt_info, tree bump)
5184 {
5185   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5186   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5187   tree update = TYPE_SIZE_UNIT (vectype);
5188   gimple *incr_stmt;
5189   ssa_op_iter iter;
5190   use_operand_p use_p;
5191   tree new_dataref_ptr;
5192
5193   if (bump)
5194     update = bump;
5195
5196   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5197     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5198   else
5199     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5200   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5201                                    dataref_ptr, update);
5202   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5203   /* Fold the increment, avoiding excessive chains use-def chains of
5204      those, leading to compile-time issues for passes until the next
5205      forwprop pass which would do this as well.  */
5206   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5207   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5208     {
5209       incr_stmt = gsi_stmt (fold_gsi);
5210       update_stmt (incr_stmt);
5211     }
5212
5213   /* Copy the points-to information if it exists. */
5214   if (DR_PTR_INFO (dr))
5215     {
5216       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5217       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5218     }
5219
5220   if (!ptr_incr)
5221     return new_dataref_ptr;
5222
5223   /* Update the vector-pointer's cross-iteration increment.  */
5224   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5225     {
5226       tree use = USE_FROM_PTR (use_p);
5227
5228       if (use == dataref_ptr)
5229         SET_USE (use_p, new_dataref_ptr);
5230       else
5231         gcc_assert (operand_equal_p (use, update, 0));
5232     }
5233
5234   return new_dataref_ptr;
5235 }
5236
5237
5238 /* Copy memory reference info such as base/clique from the SRC reference
5239    to the DEST MEM_REF.  */
5240
5241 void
5242 vect_copy_ref_info (tree dest, tree src)
5243 {
5244   if (TREE_CODE (dest) != MEM_REF)
5245     return;
5246
5247   tree src_base = src;
5248   while (handled_component_p (src_base))
5249     src_base = TREE_OPERAND (src_base, 0);
5250   if (TREE_CODE (src_base) != MEM_REF
5251       && TREE_CODE (src_base) != TARGET_MEM_REF)
5252     return;
5253
5254   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5255   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5256 }
5257
5258
5259 /* Function vect_create_destination_var.
5260
5261    Create a new temporary for the vectorized form of the SSA name
5262    SCALAR_DEST.  If VECTYPE is non-null the temporary has type VECTYPE (and
5263    is a mask variable when VECTYPE is a vector boolean type); otherwise it
5264    is a scalar variable of SCALAR_DEST's type.  Its name is built from the
5265    name of SCALAR_DEST, if it has one, and its SSA version.  */
5266
5267 tree
5268 vect_create_destination_var (tree scalar_dest, tree vectype)
5269 {
5270   enum vect_var_kind kind;
5271   if (!vectype)
5272     kind = vect_scalar_var;
5273   else if (VECTOR_BOOLEAN_TYPE_P (vectype))
5274     kind = vect_mask_var;
5275   else
5276     kind = vect_simple_var;
5277   tree type = vectype ? vectype : TREE_TYPE (scalar_dest);
5278
5279   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5280
5281   /* Build "<name>_<version>", or "_<version>" for an anonymous name.  */
5282   const char *base_name = get_name (scalar_dest);
5283   char *var_name
5284     = (base_name
5285        ? xasprintf ("%s_%u", base_name, SSA_NAME_VERSION (scalar_dest))
5286        : xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)));
5287   tree vec_dest = vect_get_new_vect_var (type, kind, var_name);
5288   free (var_name);
5289   return vec_dest;
5290 }
5291
5292 /* Function vect_grouped_store_supported.
5293
5294    Return TRUE if the target supports the permutations needed to interleave
5295    a group of COUNT stores of vectors of type VECTYPE, FALSE otherwise.  */
5296
5297 bool
5298 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5299 {
5300   machine_mode mode = TYPE_MODE (vectype);
5301
5302   /* vect_permute_store_chain requires the group size to be equal to 3 or
5303      be a power of two.  */
5304   if (count != 3 && exact_log2 (count) == -1)
5305     {
5306       if (dump_enabled_p ())
5307         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5308                          "the size of the group of accesses"
5309                          " is not a power of 2 or not equal to 3\n");
5310       return false;
5311     }
5312
5313   /* Check that the permutation is supported.  */
5314   if (VECTOR_MODE_P (mode))
5315     {
5316       unsigned int i;
5317       if (count == 3)
5318         {
5319           /* J0, J1 and J2 are running element indices into the first,
5320              second and third vector (see vect_permute_store_chain).  */
5321           unsigned int j, j0 = 0, j1 = 0, j2 = 0;
5322           unsigned int nelt;
5323           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5324             {
5325               if (dump_enabled_p ())
5326                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5327                                  "cannot handle groups of 3 stores for"
5328                                  " variable-length vectors\n");
5329               return false;
5330             }
5331
5332           vec_perm_builder sel (nelt, nelt, 1);
5333           sel.quick_grow (nelt);
5334           vec_perm_indices indices;
5335           for (j = 0; j < 3; j++)
5336             {
5337               int nelt0 = ((3 - j) * nelt) % 3;
5338               int nelt1 = ((3 - j) * nelt + 1) % 3;
5339               int nelt2 = ((3 - j) * nelt + 2) % 3;
5340               for (i = 0; i < nelt; i++)
5341                 {
5342                   if (3 * i + nelt0 < nelt)
5343                     sel[3 * i + nelt0] = j0++;
5344                   if (3 * i + nelt1 < nelt)
5345                     sel[3 * i + nelt1] = nelt + j1++;
5346                   if (3 * i + nelt2 < nelt)
5347                     sel[3 * i + nelt2] = 0;
5348                 }
5349               indices.new_vector (sel, 2, nelt);
5350               if (!can_vec_perm_const_p (mode, indices))
5351                 {
5352                   if (dump_enabled_p ())
5353                     dump_printf (MSG_MISSED_OPTIMIZATION,
5354                                  "permutation op not supported by target.\n");
5355                   return false;
5356                 }
5357
5358               for (i = 0; i < nelt; i++)
5359                 {
5360                   if (3 * i + nelt0 < nelt)
5361                     sel[3 * i + nelt0] = 3 * i + nelt0;
5362                   if (3 * i + nelt1 < nelt)
5363                     sel[3 * i + nelt1] = 3 * i + nelt1;
5364                   if (3 * i + nelt2 < nelt)
5365                     sel[3 * i + nelt2] = nelt + j2++;
5366                 }
5367               indices.new_vector (sel, 2, nelt);
5368               if (!can_vec_perm_const_p (mode, indices))
5369                 {
5370                   if (dump_enabled_p ())
5371                     dump_printf (MSG_MISSED_OPTIMIZATION,
5372                                  "permutation op not supported by target.\n");
5373                   return false;
5374                 }
5375             }
5376           return true;
5377         }
5378       else
5379         {
5380           /* If length is not equal to 3 then only power of 2 is supported.  */
5381           gcc_assert (pow2p_hwi (count));
5382           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5383
5384           /* The encoding has 2 interleaved stepped patterns.  */
5385           vec_perm_builder sel (nelt, 2, 3);
5386           sel.quick_grow (6);
5387           for (i = 0; i < 3; i++)
5388             {
5389               sel[i * 2] = i;
5390               sel[i * 2 + 1] = i + nelt;
5391             }
5392           vec_perm_indices indices (sel, 2, nelt);
5393           if (can_vec_perm_const_p (mode, indices))
5394             {
5395               for (i = 0; i < 6; i++)
5396                 sel[i] += exact_div (nelt, 2);
5397               indices.new_vector (sel, 2, nelt);
5398               if (can_vec_perm_const_p (mode, indices))
5399                 return true;
5400             }
5401         }
5402     }
5403
5404   if (dump_enabled_p ())
5405     dump_printf (MSG_MISSED_OPTIMIZATION,
5406                  "permutation op not supported by target.\n");
5407   return false;
5408 }
5409
5410
5411 /* Return TRUE if the target provides a store-lanes operation for COUNT
5412    vectors of type VECTYPE.  The vec_mask_store_lanes optab is queried when
5413    MASKED_P is true, the vec_store_lanes optab otherwise.  */
5414
5415 bool
5416 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5417                             bool masked_p)
5418 {
5419   if (!masked_p)
5420     return vect_lanes_optab_supported_p ("vec_store_lanes",
5421                                          vec_store_lanes_optab, vectype, count);
5422
5423   return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5424                                        vec_mask_store_lanes_optab,
5425                                        vectype, count);
5426 }
5427
5428
5429 /* Function vect_permute_store_chain.
5430
5431    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5432    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5433    the data correctly for the stores.  Return the final references for stores
5434    in RESULT_CHAIN.

        The permute stmts are emitted through vect_finish_stmt_generation using
        VINFO, STMT_INFO and GSI; the vector type is taken from STMT_INFO.
        For a power-of-2 LENGTH, DR_CHAIN itself is overwritten with the
        result of each stage.
5435
5436    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5437    The input is 4 vectors each containing 8 elements.  We assign a number to
5438    each element, the input sequence is:
5439
5440    1st vec:   0  1  2  3  4  5  6  7
5441    2nd vec:   8  9 10 11 12 13 14 15
5442    3rd vec:  16 17 18 19 20 21 22 23
5443    4th vec:  24 25 26 27 28 29 30 31
5444
5445    The output sequence should be:
5446
5447    1st vec:  0  8 16 24  1  9 17 25
5448    2nd vec:  2 10 18 26  3 11 19 27
5449    3rd vec:  4 12 20 28  5 13 21 29
5450    4th vec:  6 14 22 30  7 15 23 31
5451
5452    i.e., we interleave the contents of the four vectors in their order.
5453
5454    We use interleave_high/low instructions to create such output.  The input of
5455    each interleave_high/low operation is two vectors:
5456    1st vec    2nd vec
5457    0 1 2 3    4 5 6 7
5458    the even elements of the result vector are obtained left-to-right from the
5459    high/low elements of the first vector.  The odd elements of the result are
5460    obtained left-to-right from the high/low elements of the second vector.
5461    The output of interleave_high will be:   0 4 1 5
5462    and of interleave_low:                   2 6 3 7
5463
5464
5465    The permutation is done in log LENGTH stages.  In each stage interleave_high
5466    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5467    where the first argument is taken from the first half of DR_CHAIN and the
5468    second argument from its second half.
5469    In our example,
5470
5471    I1: interleave_high (1st vec, 3rd vec)
5472    I2: interleave_low (1st vec, 3rd vec)
5473    I3: interleave_high (2nd vec, 4th vec)
5474    I4: interleave_low (2nd vec, 4th vec)
5475
5476    The output for the first stage is:
5477
5478    I1:  0 16  1 17  2 18  3 19
5479    I2:  4 20  5 21  6 22  7 23
5480    I3:  8 24  9 25 10 26 11 27
5481    I4: 12 28 13 29 14 30 15 31
5482
5483    The output of the second stage, i.e. the final result is:
5484
5485    I1:  0  8 16 24  1  9 17 25
5486    I2:  2 10 18 26  3 11 19 27
5487    I3:  4 12 20 28  5 13 21 29
5488    I4:  6 14 22 30  7 15 23 31.  */
5489
5490 void
5491 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5492                           unsigned int length,
5493                           stmt_vec_info stmt_info,
5494                           gimple_stmt_iterator *gsi,
5495                           vec<tree> *result_chain)
5496 {
5497   tree vect1, vect2, high, low;
5498   gimple *perm_stmt;
5499   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5500   tree perm_mask_low, perm_mask_high;
5501   tree data_ref;
5502   tree perm3_mask_low, perm3_mask_high;
       /* LOG_LENGTH is only used on the power-of-2 path below.  */
5503   unsigned int i, j, n, log_length = exact_log2 (length);
5504
       /* Start with RESULT_CHAIN holding a copy of the first LENGTH entries
          of DR_CHAIN.  */
5505   result_chain->quick_grow (length);
5506   memcpy (result_chain->address (), dr_chain.address (),
5507           length * sizeof (tree));
5508
5509   if (length == 3)
5510     {
5511       /* vect_grouped_store_supported ensures that this is constant.  */
5512       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5513       unsigned int j0 = 0, j1 = 0, j2 = 0;
5514
5515       vec_perm_builder sel (nelt, nelt, 1);
5516       sel.quick_grow (nelt);
5517       vec_perm_indices indices;
5518       for (j = 0; j < 3; j++)
5519         {
5520           int nelt0 = ((3 - j) * nelt) % 3;
5521           int nelt1 = ((3 - j) * nelt + 1) % 3;
5522           int nelt2 = ((3 - j) * nelt + 2) % 3;
5523
5524           for (i = 0; i < nelt; i++)
5525             {
5526               if (3 * i + nelt0 < nelt)
5527                 sel[3 * i + nelt0] = j0++;
5528               if (3 * i + nelt1 < nelt)
5529                 sel[3 * i + nelt1] = nelt + j1++;
5530               if (3 * i + nelt2 < nelt)
5531                 sel[3 * i + nelt2] = 0;
5532             }
5533           indices.new_vector (sel, 2, nelt);
5534           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5535
5536           for (i = 0; i < nelt; i++)
5537             {
5538               if (3 * i + nelt0 < nelt)
5539                 sel[3 * i + nelt0] = 3 * i + nelt0;
5540               if (3 * i + nelt1 < nelt)
5541                 sel[3 * i + nelt1] = 3 * i + nelt1;
5542               if (3 * i + nelt2 < nelt)
5543                 sel[3 * i + nelt2] = nelt + j2++;
5544             }
5545           indices.new_vector (sel, 2, nelt);
5546           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5547
5548           vect1 = dr_chain[0];
5549           vect2 = dr_chain[1];
5550
5551           /* Create interleaving stmt:
5552              low = VEC_PERM_EXPR <vect1, vect2,
5553                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5554                                    j + 2, nelt + j + 2, *, ...}>  */
5555           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5556           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5557                                            vect2, perm3_mask_low);
5558           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5559
5560           vect1 = data_ref;
5561           vect2 = dr_chain[2];
5562           /* Create interleaving stmt:
5563              high = VEC_PERM_EXPR <vect1, vect2,
5564                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
5565                                     6, 7, nelt + j + 2, ...}>  */
5566           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5567           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5568                                            vect2, perm3_mask_high);
5569           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5570           (*result_chain)[j] = data_ref;
5571         }
5572     }
5573   else
5574     {
5575       /* If length is not equal to 3 then only power of 2 is supported.  */
5576       gcc_assert (pow2p_hwi (length));
5577
5578       /* The encoding has 2 interleaved stepped patterns.  */
5579       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5580       vec_perm_builder sel (nelt, 2, 3);
5581       sel.quick_grow (6);
5582       for (i = 0; i < 3; i++)
5583         {
5584           sel[i * 2] = i;
5585           sel[i * 2 + 1] = i + nelt;
5586         }
5587         vec_perm_indices indices (sel, 2, nelt);
5588         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5589
5590         for (i = 0; i < 6; i++)
5591           sel[i] += exact_div (nelt, 2);
5592         indices.new_vector (sel, 2, nelt);
5593         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5594
5595         for (i = 0, n = log_length; i < n; i++)
5596           {
5597             for (j = 0; j < length/2; j++)
5598               {
5599                 vect1 = dr_chain[j];
5600                 vect2 = dr_chain[j+length/2];
5601
5602                 /* Create interleaving stmt:
5603                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5604                                                         ...}>  */
5605                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5606                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5607                                                  vect2, perm_mask_high);
5608                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5609                 (*result_chain)[2*j] = high;
5610
5611                 /* Create interleaving stmt:
5612                    low = VEC_PERM_EXPR <vect1, vect2,
5613                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5614                                          ...}>  */
5615                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5616                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5617                                                  vect2, perm_mask_low);
5618                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5619                 (*result_chain)[2*j+1] = low;
5620               }
                 /* Feed the output of this stage back in as the input of the
                    next one.  */
5621             memcpy (dr_chain.address (), result_chain->address (),
5622                     length * sizeof (tree));
5623           }
5624     }
5625 }
5626
5627 /* Function vect_setup_realignment
5628
5629    This function is called when vectorizing an unaligned load using
5630    the dr_explicit_realign[_optimized] scheme.
5631    This function generates the following code at the loop prolog:
5632
5633       p = initial_addr;
5634    x  msq_init = *(floor(p));   # prolog load
5635       realignment_token = call target_builtin;
5636     loop:
5637    x  msq = phi (msq_init, ---)
5638
5639    The stmts marked with x are generated only for the case of
5640    dr_explicit_realign_optimized.
5641
5642    The code above sets up a new (vector) pointer, pointing to the first
5643    location accessed by STMT_INFO, and a "floor-aligned" load using that
5644    pointer.  It also generates code to compute the "realignment-token"
5645    (if the relevant target hook was defined), and creates a phi-node at the
5646    loop-header bb whose arguments are the result of the prolog-load (created
5647    by this function) and the result of a load that takes place in the loop
5648    (to be created by the caller to this function).
5649
5650    For the case of dr_explicit_realign_optimized:
5651    The caller to this function uses the phi-result (msq) to create the
5652    realignment code inside the loop, and sets up the missing phi argument,
5653    as follows:
5654     loop:
5655       msq = phi (msq_init, lsq)
5656       lsq = *(floor(p'));        # load in loop
5657       result = realign_load (msq, lsq, realignment_token);
5658
5659    For the case of dr_explicit_realign:
5660     loop:
5661       msq = *(floor(p));        # load in loop
5662       p' = p + (VS-1);
5663       lsq = *(floor(p'));       # load in loop
5664       result = realign_load (msq, lsq, realignment_token);
5665
5666    Input:
5667    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5668                a memory location that may be unaligned.
5669    GSI - place where new code is to be inserted.
5670    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5671                               is used.
        INIT_ADDR - if not NULL_TREE, the initial address already computed by
                    the caller inside the loop; the realignment token is then
                    generated at GSI rather than in a loop preheader.
5672
5673    Output:
5674    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5675                        target hook, if defined.
        AT_LOOP - if non-null, *AT_LOOP is set to the loop whose preheader is
                  chosen for the initial vector load (NULL if VINFO is not a
                  loop_vec_info).
5676    Return value - the result of the loop-header phi node, or NULL_TREE for
                       the dr_explicit_realign scheme.  */
5677
5678 tree
5679 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5680                         gimple_stmt_iterator *gsi, tree *realignment_token,
5681                         enum dr_alignment_support alignment_support_scheme,
5682                         tree init_addr,
5683                         class loop **at_loop)
5684 {
5685   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5686   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5687   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5688   struct data_reference *dr = dr_info->dr;
5689   class loop *loop = NULL;
5690   edge pe = NULL;
5691   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5692   tree vec_dest;
5693   gimple *inc;
5694   tree ptr;
5695   tree data_ref;
5696   basic_block new_bb;
5697   tree msq_init = NULL_TREE;
5698   tree new_temp;
5699   gphi *phi_stmt;
5700   tree msq = NULL_TREE;
5701   gimple_seq stmts = NULL;
5702   bool compute_in_loop = false;
5703   bool nested_in_vect_loop = false;
5704   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5705   class loop *loop_for_initial_load = NULL;
5706
5707   if (loop_vinfo)
5708     {
5709       loop = LOOP_VINFO_LOOP (loop_vinfo);
5710       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5711     }
5712
5713   gcc_assert (alignment_support_scheme == dr_explicit_realign
5714               || alignment_support_scheme == dr_explicit_realign_optimized);
5715
5716   /* We need to generate three things:
5717      1. the misalignment computation
5718      2. the extra vector load (for the optimized realignment scheme).
5719      3. the phi node for the two vectors from which the realignment is
5720       done (for the optimized realignment scheme).  */
5721
5722   /* 1. Determine where to generate the misalignment computation.
5723
5724      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5725      calculation will be generated by this function, outside the loop (in the
5726      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5727      caller, inside the loop.
5728
5729      Background: If the misalignment remains fixed throughout the iterations of
5730      the loop, then both realignment schemes are applicable, and also the
5731      misalignment computation can be done outside LOOP.  This is because we are
5732      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5733      are a multiple of VS (the Vector Size), and therefore the misalignment in
5734      different vectorized LOOP iterations is always the same.
5735      The problem arises only if the memory access is in an inner-loop nested
5736      inside LOOP, which is now being vectorized using outer-loop vectorization.
5737      This is the only case when the misalignment of the memory access may not
5738      remain fixed throughout the iterations of the inner-loop (as explained in
5739      detail in vect_supportable_dr_alignment).  In this case, not only is the
5740      optimized realignment scheme not applicable, but also the misalignment
5741      computation (and generation of the realignment token that is passed to
5742      REALIGN_LOAD) have to be done inside the loop.
5743
5744      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5745      or not, which in turn determines if the misalignment is computed inside
5746      the inner-loop, or outside LOOP.  */
5747
5748   if (init_addr != NULL_TREE || !loop_vinfo)
5749     {
5750       compute_in_loop = true;
5751       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5752     }
5753
5754
5755   /* 2. Determine where to generate the extra vector load.
5756
5757      For the optimized realignment scheme, instead of generating two vector
5758      loads in each iteration, we generate a single extra vector load in the
5759      preheader of the loop, and in each iteration reuse the result of the
5760      vector load from the previous iteration.  In case the memory access is in
5761      an inner-loop nested inside LOOP, which is now being vectorized using
5762      outer-loop vectorization, we need to determine whether this initial vector
5763      load should be generated at the preheader of the inner-loop, or can be
5764      generated at the preheader of LOOP.  If the memory access has no evolution
5765      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5766      to be generated inside LOOP (in the preheader of the inner-loop).  */
5767
5768   if (nested_in_vect_loop)
5769     {
5770       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5771       bool invariant_in_outerloop =
5772             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5773       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5774     }
5775   else
5776     loop_for_initial_load = loop;
5777   if (at_loop)
5778     *at_loop = loop_for_initial_load;
5779
5780   if (loop_for_initial_load)
5781     pe = loop_preheader_edge (loop_for_initial_load);
5782
5783   /* 3. For the case of the optimized realignment, create the first vector
5784       load at the loop preheader.  */
5785
5786   if (alignment_support_scheme == dr_explicit_realign_optimized)
5787     {
5788       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5789       gassign *new_stmt;
5790
5791       gcc_assert (!compute_in_loop);
5792       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5793       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5794                                       loop_for_initial_load, NULL_TREE,
5795                                       &init_addr, NULL, &inc, true);
5796       if (TREE_CODE (ptr) == SSA_NAME)
5797         new_temp = copy_ssa_name (ptr);
5798       else
5799         new_temp = make_ssa_name (TREE_TYPE (ptr));
5800       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5801       tree type = TREE_TYPE (ptr);
           /* Compute floor (ptr) as ptr & -align, ALIGN being the target
              alignment of the data reference.  */
5802       new_stmt = gimple_build_assign
5803                    (new_temp, BIT_AND_EXPR, ptr,
5804                     fold_build2 (MINUS_EXPR, type,
5805                                  build_int_cst (type, 0),
5806                                  build_int_cst (type, align)));
5807       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5808       gcc_assert (!new_bb);
5809       data_ref
5810         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5811                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5812       vect_copy_ref_info (data_ref, DR_REF (dr));
5813       new_stmt = gimple_build_assign (vec_dest, data_ref);
5814       new_temp = make_ssa_name (vec_dest, new_stmt);
5815       gimple_assign_set_lhs (new_stmt, new_temp);
           /* NOTE(review): PE has already been used unconditionally for the
              insertion above, so the else arm looks unreachable - confirm.  */
5816       if (pe)
5817         {
5818           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5819           gcc_assert (!new_bb);
5820         }
5821       else
5822          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5823
5824       msq_init = gimple_assign_lhs (new_stmt);
5825     }
5826
5827   /* 4. Create realignment token using a target builtin, if available.
5828       It is done either inside the containing loop, or before LOOP (as
5829       determined above).  */
5830
5831   if (targetm.vectorize.builtin_mask_for_load)
5832     {
5833       gcall *new_stmt;
5834       tree builtin_decl;
5835
5836       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5837       if (!init_addr)
5838         {
5839           /* Generate the INIT_ADDR computation outside LOOP.  */
5840           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5841                                                             stmt_info, &stmts,
5842                                                             NULL_TREE);
5843           if (loop)
5844             {
5845               pe = loop_preheader_edge (loop);
5846               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5847               gcc_assert (!new_bb);
5848             }
5849           else
5850              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5851         }
5852
5853       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5854       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5855       vec_dest =
5856         vect_create_destination_var (scalar_dest,
5857                                      gimple_call_return_type (new_stmt));
5858       new_temp = make_ssa_name (vec_dest, new_stmt);
5859       gimple_call_set_lhs (new_stmt, new_temp);
5860
5861       if (compute_in_loop)
5862         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5863       else
5864         {
5865           /* Generate the misalignment computation outside LOOP.  */
5866           pe = loop_preheader_edge (loop);
5867           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5868           gcc_assert (!new_bb);
5869         }
5870
5871       *realignment_token = gimple_call_lhs (new_stmt);
5872
5873       /* The result of the CALL_EXPR to this builtin is determined from
5874          the value of the parameter and no global variables are touched
5875          which makes the builtin a "const" function.  Requiring the
5876          builtin to have the "const" attribute makes it unnecessary
5877          to call mark_call_clobbered.  */
5878       gcc_assert (TREE_READONLY (builtin_decl));
5879     }
5880
5881   if (alignment_support_scheme == dr_explicit_realign)
5882     return msq;
5883
5884   gcc_assert (!compute_in_loop);
5885   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5886
5887
5888   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5889
5890   pe = loop_preheader_edge (containing_loop);
5891   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5892   msq = make_ssa_name (vec_dest);
5893   phi_stmt = create_phi_node (msq, containing_loop->header);
5894   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5895
5896   return msq;
5897 }
5898
5899
5900 /* Function vect_grouped_load_supported.
5901
5902    Return true if the target supports the permutes vect_permute_load_chain
5903    needs for a load group of COUNT vectors of type VECTYPE.
5904
5905    COUNT is the size of the load group, i.e. the number of statements plus
5906    the number of gaps.  SINGLE_ELEMENT_P is true if the group really
5907    consists of a single statement followed by a gap of COUNT - 1.  */
5908
5909 bool
5910 vect_grouped_load_supported (tree vectype, bool single_element_p,
5911                              unsigned HOST_WIDE_INT count)
5912 {
5913   machine_mode vmode = TYPE_MODE (vectype);
5914
5915   /* Punt on single-element interleaving whose element distance leaves
5916      unused vector loads around: the code created for it is at least very
5917      sub-optimal and blows up memory (see PR65518).  */
5918   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5919     {
5920       if (dump_enabled_p ())
5921         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5922                          "single-element interleaving not supported "
5923                          "for not adjacent vector loads\n");
5924       return false;
5925     }
5926
5927   /* vect_permute_load_chain only copes with a group size that is 3 or a
5928      power of two.  */
5929   if (!(count == 3 || exact_log2 (count) != -1))
5930     {
5931       if (dump_enabled_p ())
5932         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5933                          "the size of the group of accesses"
5934                          " is not a power of 2 or not equal to 3\n");
5935       return false;
5936     }
5937
5938   /* Check that the target supports each permute mask needed.  */
5939   if (VECTOR_MODE_P (vmode) && count == 3)
5940     {
5941       unsigned int nunits;
5942       if (!GET_MODE_NUNITS (vmode).is_constant (&nunits))
5943         {
5944           if (dump_enabled_p ())
5945             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5946                              "cannot handle groups of 3 loads for"
5947                              " variable-length vectors\n");
5948           return false;
5949         }
5950
5951       vec_perm_builder sel (nunits, nunits, 1);
5952       sel.quick_grow (nunits);
5953       vec_perm_indices indices;
5954       for (unsigned int k = 0; k < 3; k++)
5955         {
5956           /* First mask: select every third element, starting at K, of the
5957              two concatenated inputs; lanes whose index would fall beyond
5958              both inputs are set to 0.  */
5959           for (unsigned int i = 0; i < nunits; i++)
5960             sel[i] = (3 * i + k < 2 * nunits) ? 3 * i + k : 0;
5961           indices.new_vector (sel, 2, nunits);
5962           if (!can_vec_perm_const_p (vmode, indices))
5963             {
5964               if (dump_enabled_p ())
5965                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5966                                  "shuffle of 3 loads is not supported by"
5967                                  " target\n");
5968               return false;
5969             }
5970
5971           /* Second mask: keep the lanes selected above from the first
5972              input and fill the remaining lanes from the second input.  */
5973           unsigned int j = 0;
5974           for (unsigned int i = 0; i < nunits; i++)
5975             if (3 * i + k < 2 * nunits)
5976               sel[i] = i;
5977             else
5978               sel[i] = nunits + ((nunits + k) % 3) + 3 * (j++);
5979           indices.new_vector (sel, 2, nunits);
5980           if (!can_vec_perm_const_p (vmode, indices))
5981             {
5982               if (dump_enabled_p ())
5983                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5984                                  "shuffle of 3 loads is not supported by"
5985                                  " target\n");
5986               return false;
5987             }
5988         }
5989       return true;
5990     }
5991   else if (VECTOR_MODE_P (vmode))
5992     {
5993       /* If length is not equal to 3 then only power of 2 is supported.  */
5994       gcc_assert (pow2p_hwi (count));
5995       poly_uint64 nunits = GET_MODE_NUNITS (vmode);
5996
5997       /* The encoding has a single stepped pattern: { 0, 2, 4, ... }.  */
5998       vec_perm_builder sel (nunits, 1, 3);
5999       sel.quick_grow (3);
6000       for (unsigned int i = 0; i < 3; i++)
6001         sel[i] = i * 2;
6002       vec_perm_indices indices (sel, 2, nunits);
6003       if (can_vec_perm_const_p (vmode, indices))
6004         {
6005           /* Likewise for the extract-odd mask { 1, 3, 5, ... }.  */
6006           for (unsigned int i = 0; i < 3; i++)
6007             sel[i] = i * 2 + 1;
6008           indices.new_vector (sel, 2, nunits);
6009           if (can_vec_perm_const_p (vmode, indices))
6010             return true;
6011         }
6012     }
6013
6014   if (dump_enabled_p ())
6015     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6016                      "extract even/odd not supported by target\n");
6017   return false;
6018 }
6019
6020 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6021    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
6022
6023 bool
6024 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6025                            bool masked_p)
6026 {
6027   if (masked_p)
6028     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6029                                          vec_mask_load_lanes_optab,
6030                                          vectype, count);
6031   else
6032     return vect_lanes_optab_supported_p ("vec_load_lanes",
6033                                          vec_load_lanes_optab,
6034                                          vectype, count);
6035 }
6036
6037 /* Function vect_permute_load_chain.
6038
6039    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6040    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6041    the input data correctly.  Return the final references for loads in
6042    RESULT_CHAIN.
6043
6044    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6045    The input is 4 vectors each containing 8 elements. We assign a number to each
6046    element, the input sequence is:
6047
6048    1st vec:   0  1  2  3  4  5  6  7
6049    2nd vec:   8  9 10 11 12 13 14 15
6050    3rd vec:  16 17 18 19 20 21 22 23
6051    4th vec:  24 25 26 27 28 29 30 31
6052
6053    The output sequence should be:
6054
6055    1st vec:  0 4  8 12 16 20 24 28
6056    2nd vec:  1 5  9 13 17 21 25 29
6057    3rd vec:  2 6 10 14 18 22 26 30
6058    4th vec:  3 7 11 15 19 23 27 31
6059
6060    i.e., the first output vector should contain the first elements of each
6061    interleaving group, etc.
6062
6063    We use extract_even/odd instructions to create such output.  The input of
6064    each extract_even/odd operation is two vectors
6065    1st vec    2nd vec
6066    0 1 2 3    4 5 6 7
6067
6068    and the output is the vector of extracted even/odd elements.  The output of
6069    extract_even will be:   0 2 4 6
6070    and of extract_odd:     1 3 5 7
6071
6072
6073    The permutation is done in log LENGTH stages.  In each stage extract_even
6074    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6075    their order.  In our example,
6076
6077    E1: extract_even (1st vec, 2nd vec)
6078    E2: extract_odd (1st vec, 2nd vec)
6079    E3: extract_even (3rd vec, 4th vec)
6080    E4: extract_odd (3rd vec, 4th vec)
6081
6082    The output for the first stage will be:
6083
6084    E1:  0  2  4  6  8 10 12 14
6085    E2:  1  3  5  7  9 11 13 15
6086    E3: 16 18 20 22 24 26 28 30
6087    E4: 17 19 21 23 25 27 29 31
6088
6089    In order to proceed and create the correct sequence for the next stage (or
6090    for the correct output, if the second stage is the last one, as in our
6091    example), we first put the output of extract_even operation and then the
6092    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6093    The input for the second stage is:
6094
6095    1st vec (E1):  0  2  4  6  8 10 12 14
6096    2nd vec (E3): 16 18 20 22 24 26 28 30
6097    3rd vec (E2):  1  3  5  7  9 11 13 15
6098    4th vec (E4): 17 19 21 23 25 27 29 31
6099
6100    The output of the second stage:
6101
6102    E1: 0 4  8 12 16 20 24 28
6103    E2: 2 6 10 14 18 22 26 30
6104    E3: 1 5  9 13 17 21 25 29
6105    E4: 3 7 11 15 19 23 27 31
6106
6107    And RESULT_CHAIN after reordering:
6108
6109    1st vec (E1):  0 4  8 12 16 20 24 28
6110    2nd vec (E3):  1 5  9 13 17 21 25 29
6111    3rd vec (E2):  2 6 10 14 18 22 26 30
6112    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6113
6114 static void
6115 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6116                          unsigned int length,
6117                          stmt_vec_info stmt_info,
6118                          gimple_stmt_iterator *gsi,
6119                          vec<tree> *result_chain)
6120 {
6121   tree data_ref, first_vect, second_vect;
6122   tree perm_mask_even, perm_mask_odd;
6123   tree perm3_mask_low, perm3_mask_high;
6124   gimple *perm_stmt;
6125   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6126   unsigned int i, j, log_length = exact_log2 (length);
6127
6128   result_chain->quick_grow (length);
6129   memcpy (result_chain->address (), dr_chain.address (),
6130           length * sizeof (tree));
6131
6132   if (length == 3)
6133     {
6134       /* vect_grouped_load_supported ensures that this is constant.  */
6135       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6136       unsigned int k;
6137
6138       vec_perm_builder sel (nelt, nelt, 1);
6139       sel.quick_grow (nelt);
6140       vec_perm_indices indices;
6141       for (k = 0; k < 3; k++)
6142         {
6143           for (i = 0; i < nelt; i++)
6144             if (3 * i + k < 2 * nelt)
6145               sel[i] = 3 * i + k;
6146             else
6147               sel[i] = 0;
6148           indices.new_vector (sel, 2, nelt);
6149           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6150
6151           for (i = 0, j = 0; i < nelt; i++)
6152             if (3 * i + k < 2 * nelt)
6153               sel[i] = i;
6154             else
6155               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6156           indices.new_vector (sel, 2, nelt);
6157           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6158
6159           first_vect = dr_chain[0];
6160           second_vect = dr_chain[1];
6161
6162           /* Create interleaving stmt (low part of):
6163              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6164                                                              ...}>  */
6165           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6166           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6167                                            second_vect, perm3_mask_low);
6168           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6169
6170           /* Create interleaving stmt (high part of):
6171              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6172                                                               ...}>  */
6173           first_vect = data_ref;
6174           second_vect = dr_chain[2];
6175           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6176           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6177                                            second_vect, perm3_mask_high);
6178           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6179           (*result_chain)[k] = data_ref;
6180         }
6181     }
6182   else
6183     {
6184       /* If length is not equal to 3 then only power of 2 is supported.  */
6185       gcc_assert (pow2p_hwi (length));
6186
6187       /* The encoding has a single stepped pattern.  */
6188       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6189       vec_perm_builder sel (nelt, 1, 3);
6190       sel.quick_grow (3);
6191       for (i = 0; i < 3; ++i)
6192         sel[i] = i * 2;
6193       vec_perm_indices indices (sel, 2, nelt);
6194       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6195
6196       for (i = 0; i < 3; ++i)
6197         sel[i] = i * 2 + 1;
6198       indices.new_vector (sel, 2, nelt);
6199       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6200
6201       for (i = 0; i < log_length; i++)
6202         {
6203           for (j = 0; j < length; j += 2)
6204             {
6205               first_vect = dr_chain[j];
6206               second_vect = dr_chain[j+1];
6207
6208               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6209               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6210               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6211                                                first_vect, second_vect,
6212                                                perm_mask_even);
6213               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6214               (*result_chain)[j/2] = data_ref;
6215
6216               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6217               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6218               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6219                                                first_vect, second_vect,
6220                                                perm_mask_odd);
6221               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6222               (*result_chain)[j/2+length/2] = data_ref;
6223             }
6224           memcpy (dr_chain.address (), result_chain->address (),
6225                   length * sizeof (tree));
6226         }
6227     }
6228 }
6229
6230 /* Function vect_shift_permute_load_chain.
6231
6232    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6233    sequence of stmts to reorder the input data accordingly.
6234    Return the final references for loads in RESULT_CHAIN.
6235    Return true if successed, false otherwise.
6236
6237    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6238    The input is 3 vectors each containing 8 elements.  We assign a
6239    number to each element, the input sequence is:
6240
6241    1st vec:   0  1  2  3  4  5  6  7
6242    2nd vec:   8  9 10 11 12 13 14 15
6243    3rd vec:  16 17 18 19 20 21 22 23
6244
6245    The output sequence should be:
6246
6247    1st vec:  0 3 6  9 12 15 18 21
6248    2nd vec:  1 4 7 10 13 16 19 22
6249    3rd vec:  2 5 8 11 14 17 20 23
6250
6251    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6252
6253    First we shuffle all 3 vectors to get correct elements order:
6254
6255    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6256    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6257    3rd vec:  (16 19 22) (17 20 23) (18 21)
6258
6259    Next we unite and shift vector 3 times:
6260
6261    1st step:
6262      shift right by 6 the concatenation of:
6263      "1st vec" and  "2nd vec"
6264        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6265      "2nd vec" and  "3rd vec"
6266        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6267      "3rd vec" and  "1st vec"
6268        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6269                              | New vectors                   |
6270
6271      So that now new vectors are:
6272
6273      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6274      2nd vec:  (10 13) (16 19 22) (17 20 23)
6275      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6276
6277    2nd step:
6278      shift right by 5 the concatenation of:
6279      "1st vec" and  "3rd vec"
6280        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6281      "2nd vec" and  "1st vec"
6282        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6283      "3rd vec" and  "2nd vec"
6284        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6285                           | New vectors                   |
6286
6287      So that now new vectors are:
6288
6289      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6290      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6291      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6292
6293    3rd step:
6294      shift right by 5 the concatenation of:
6295      "1st vec" and  "1st vec"
6296        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6297      shift right by 3 the concatenation of:
6298      "2nd vec" and  "2nd vec"
6299                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6300                           | New vectors                   |
6301
6302      So that now all vectors are READY:
6303      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6304      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6305      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6306
6307    This algorithm is faster than one in vect_permute_load_chain if:
6308      1.  "shift of a concatination" is faster than general permutation.
6309          This is usually so.
6310      2.  The TARGET machine can't execute vector instructions in parallel.
6311          This is because each step of the algorithm depends on previous.
6312          The algorithm in vect_permute_load_chain is much more parallel.
6313
6314    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6315 */
6316
6317 static bool
6318 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6319                                unsigned int length,
6320                                stmt_vec_info stmt_info,
6321                                gimple_stmt_iterator *gsi,
6322                                vec<tree> *result_chain)
6323 {
6324   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6325   tree perm2_mask1, perm2_mask2, perm3_mask;
6326   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6327   gimple *perm_stmt;
6328
6329   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6330   unsigned int i;
6331   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6332
6333   unsigned HOST_WIDE_INT nelt, vf;
6334   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6335       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6336     /* Not supported for variable-length vectors.  */
6337     return false;
6338
6339   vec_perm_builder sel (nelt, nelt, 1);
6340   sel.quick_grow (nelt);
6341
6342   result_chain->quick_grow (length);
6343   memcpy (result_chain->address (), dr_chain.address (),
6344           length * sizeof (tree));
6345
6346   if (pow2p_hwi (length) && vf > 4)
6347     {
6348       unsigned int j, log_length = exact_log2 (length);
6349       for (i = 0; i < nelt / 2; ++i)
6350         sel[i] = i * 2;
6351       for (i = 0; i < nelt / 2; ++i)
6352         sel[nelt / 2 + i] = i * 2 + 1;
6353       vec_perm_indices indices (sel, 2, nelt);
6354       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6355         {
6356           if (dump_enabled_p ())
6357             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6358                              "shuffle of 2 fields structure is not \
6359                               supported by target\n");
6360           return false;
6361         }
6362       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6363
6364       for (i = 0; i < nelt / 2; ++i)
6365         sel[i] = i * 2 + 1;
6366       for (i = 0; i < nelt / 2; ++i)
6367         sel[nelt / 2 + i] = i * 2;
6368       indices.new_vector (sel, 2, nelt);
6369       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6370         {
6371           if (dump_enabled_p ())
6372             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6373                              "shuffle of 2 fields structure is not \
6374                               supported by target\n");
6375           return false;
6376         }
6377       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6378
6379       /* Generating permutation constant to shift all elements.
6380          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6381       for (i = 0; i < nelt; i++)
6382         sel[i] = nelt / 2 + i;
6383       indices.new_vector (sel, 2, nelt);
6384       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6385         {
6386           if (dump_enabled_p ())
6387             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6388                              "shift permutation is not supported by target\n");
6389           return false;
6390         }
6391       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6392
6393       /* Generating permutation constant to select vector from 2.
6394          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6395       for (i = 0; i < nelt / 2; i++)
6396         sel[i] = i;
6397       for (i = nelt / 2; i < nelt; i++)
6398         sel[i] = nelt + i;
6399       indices.new_vector (sel, 2, nelt);
6400       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6401         {
6402           if (dump_enabled_p ())
6403             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6404                              "select is not supported by target\n");
6405           return false;
6406         }
6407       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6408
6409       for (i = 0; i < log_length; i++)
6410         {
6411           for (j = 0; j < length; j += 2)
6412             {
6413               first_vect = dr_chain[j];
6414               second_vect = dr_chain[j + 1];
6415
6416               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6417               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6418                                                first_vect, first_vect,
6419                                                perm2_mask1);
6420               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6421               vect[0] = data_ref;
6422
6423               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6424               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6425                                                second_vect, second_vect,
6426                                                perm2_mask2);
6427               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6428               vect[1] = data_ref;
6429
6430               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6431               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6432                                                vect[0], vect[1], shift1_mask);
6433               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6434               (*result_chain)[j/2 + length/2] = data_ref;
6435
6436               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6437               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6438                                                vect[0], vect[1], select_mask);
6439               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6440               (*result_chain)[j/2] = data_ref;
6441             }
6442           memcpy (dr_chain.address (), result_chain->address (),
6443                   length * sizeof (tree));
6444         }
6445       return true;
6446     }
6447   if (length == 3 && vf > 2)
6448     {
6449       unsigned int k = 0, l = 0;
6450
6451       /* Generating permutation constant to get all elements in rigth order.
6452          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6453       for (i = 0; i < nelt; i++)
6454         {
6455           if (3 * k + (l % 3) >= nelt)
6456             {
6457               k = 0;
6458               l += (3 - (nelt % 3));
6459             }
6460           sel[i] = 3 * k + (l % 3);
6461           k++;
6462         }
6463       vec_perm_indices indices (sel, 2, nelt);
6464       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6465         {
6466           if (dump_enabled_p ())
6467             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6468                              "shuffle of 3 fields structure is not \
6469                               supported by target\n");
6470           return false;
6471         }
6472       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6473
6474       /* Generating permutation constant to shift all elements.
6475          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6476       for (i = 0; i < nelt; i++)
6477         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6478       indices.new_vector (sel, 2, nelt);
6479       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6480         {
6481           if (dump_enabled_p ())
6482             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6483                              "shift permutation is not supported by target\n");
6484           return false;
6485         }
6486       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6487
6488       /* Generating permutation constant to shift all elements.
6489          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6490       for (i = 0; i < nelt; i++)
6491         sel[i] = 2 * (nelt / 3) + 1 + i;
6492       indices.new_vector (sel, 2, nelt);
6493       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6494         {
6495           if (dump_enabled_p ())
6496             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6497                              "shift permutation is not supported by target\n");
6498           return false;
6499         }
6500       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6501
6502       /* Generating permutation constant to shift all elements.
6503          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6504       for (i = 0; i < nelt; i++)
6505         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6506       indices.new_vector (sel, 2, nelt);
6507       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6508         {
6509           if (dump_enabled_p ())
6510             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6511                              "shift permutation is not supported by target\n");
6512           return false;
6513         }
6514       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6515
6516       /* Generating permutation constant to shift all elements.
6517          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6518       for (i = 0; i < nelt; i++)
6519         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6520       indices.new_vector (sel, 2, nelt);
6521       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6522         {
6523           if (dump_enabled_p ())
6524             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6525                              "shift permutation is not supported by target\n");
6526           return false;
6527         }
6528       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6529
6530       for (k = 0; k < 3; k++)
6531         {
6532           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6533           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6534                                            dr_chain[k], dr_chain[k],
6535                                            perm3_mask);
6536           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6537           vect[k] = data_ref;
6538         }
6539
6540       for (k = 0; k < 3; k++)
6541         {
6542           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6543           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6544                                            vect[k % 3], vect[(k + 1) % 3],
6545                                            shift1_mask);
6546           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6547           vect_shift[k] = data_ref;
6548         }
6549
6550       for (k = 0; k < 3; k++)
6551         {
6552           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6553           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6554                                            vect_shift[(4 - k) % 3],
6555                                            vect_shift[(3 - k) % 3],
6556                                            shift2_mask);
6557           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6558           vect[k] = data_ref;
6559         }
6560
6561       (*result_chain)[3 - (nelt % 3)] = vect[2];
6562
6563       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6564       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6565                                        vect[0], shift3_mask);
6566       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6567       (*result_chain)[nelt % 3] = data_ref;
6568
6569       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6570       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6571                                        vect[1], shift4_mask);
6572       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6573       (*result_chain)[0] = data_ref;
6574       return true;
6575     }
6576   return false;
6577 }
6578
6579 /* Function vect_transform_grouped_load.
6580
6581    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6582    to perform their permutation and ascribe the result vectorized statements to
6583    the scalar statements.
6584 */
6585
6586 void
6587 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6588                              vec<tree> dr_chain,
6589                              int size, gimple_stmt_iterator *gsi)
6590 {
6591   machine_mode mode;
6592   vec<tree> result_chain = vNULL;
6593
6594   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6595      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6596      vectors, that are ready for vector computation.  */
6597   result_chain.create (size);
6598
6599   /* If reassociation width for vector type is 2 or greater target machine can
6600      execute 2 or more vector instructions in parallel.  Otherwise try to
6601      get chain for loads group using vect_shift_permute_load_chain.  */
6602   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6603   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6604       || pow2p_hwi (size)
6605       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6606                                          gsi, &result_chain))
6607     vect_permute_load_chain (vinfo, dr_chain,
6608                              size, stmt_info, gsi, &result_chain);
6609   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6610   result_chain.release ();
6611 }
6612
6613 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6614    generated as part of the vectorization of STMT_INFO.  Assign the statement
6615    for each vector to the associated scalar statement.  */
6616
6617 void
6618 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6619                                   vec<tree> result_chain)
6620 {
6621   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6622   unsigned int i, gap_count;
6623   tree tmp_data_ref;
6624
6625   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6626      Since we scan the chain starting from it's first node, their order
6627      corresponds the order of data-refs in RESULT_CHAIN.  */
6628   stmt_vec_info next_stmt_info = first_stmt_info;
6629   gap_count = 1;
6630   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6631     {
6632       if (!next_stmt_info)
6633         break;
6634
6635       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6636        code elimination pass later.  No need to check for the first stmt in
6637        the group, since it always exists.
6638        DR_GROUP_GAP is the number of steps in elements from the previous
6639        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6640        correspond to the gaps.  */
6641       if (next_stmt_info != first_stmt_info
6642           && gap_count < DR_GROUP_GAP (next_stmt_info))
6643         {
6644           gap_count++;
6645           continue;
6646         }
6647
6648       /* ???  The following needs cleanup after the removal of
6649          DR_GROUP_SAME_DR_STMT.  */
6650       if (next_stmt_info)
6651         {
6652           gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6653           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6654              copies, and we put the new vector statement last.  */
6655           STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6656
6657           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6658           gap_count = 1;
6659         }
6660     }
6661 }
6662
6663 /* Function vect_force_dr_alignment_p.
6664
6665    Returns whether the alignment of a DECL can be forced to be aligned
6666    on ALIGNMENT bit boundary.  */
6667
6668 bool
6669 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6670 {
6671   if (!VAR_P (decl))
6672     return false;
6673
6674   if (decl_in_symtab_p (decl)
6675       && !symtab_node::get (decl)->can_increase_alignment_p ())
6676     return false;
6677
6678   if (TREE_STATIC (decl))
6679     return (known_le (alignment,
6680                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6681   else
6682     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6683 }
6684
6685 /* Return whether the data reference DR_INFO is supported with respect to its
6686    alignment.
6687    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6688    it is aligned, i.e., check if it is possible to vectorize it with different
6689    alignment.  */
6690
6691 enum dr_alignment_support
6692 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6693                                tree vectype, int misalignment)
6694 {
6695   data_reference *dr = dr_info->dr;
6696   stmt_vec_info stmt_info = dr_info->stmt;
6697   machine_mode mode = TYPE_MODE (vectype);
6698   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6699   class loop *vect_loop = NULL;
6700   bool nested_in_vect_loop = false;
6701
6702   if (misalignment == 0)
6703     return dr_aligned;
6704
6705   /* For now assume all conditional loads/stores support unaligned
6706      access without any special code.  */
6707   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6708     if (gimple_call_internal_p (stmt)
6709         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6710             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6711       return dr_unaligned_supported;
6712
6713   if (loop_vinfo)
6714     {
6715       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6716       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6717     }
6718
6719   /* Possibly unaligned access.  */
6720
6721   /* We can choose between using the implicit realignment scheme (generating
6722      a misaligned_move stmt) and the explicit realignment scheme (generating
6723      aligned loads with a REALIGN_LOAD).  There are two variants to the
6724      explicit realignment scheme: optimized, and unoptimized.
6725      We can optimize the realignment only if the step between consecutive
6726      vector loads is equal to the vector size.  Since the vector memory
6727      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6728      is guaranteed that the misalignment amount remains the same throughout the
6729      execution of the vectorized loop.  Therefore, we can create the
6730      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6731      at the loop preheader.
6732
6733      However, in the case of outer-loop vectorization, when vectorizing a
6734      memory access in the inner-loop nested within the LOOP that is now being
6735      vectorized, while it is guaranteed that the misalignment of the
6736      vectorized memory access will remain the same in different outer-loop
6737      iterations, it is *not* guaranteed that is will remain the same throughout
6738      the execution of the inner-loop.  This is because the inner-loop advances
6739      with the original scalar step (and not in steps of VS).  If the inner-loop
6740      step happens to be a multiple of VS, then the misalignment remains fixed
6741      and we can use the optimized realignment scheme.  For example:
6742
6743       for (i=0; i<N; i++)
6744         for (j=0; j<M; j++)
6745           s += a[i+j];
6746
6747      When vectorizing the i-loop in the above example, the step between
6748      consecutive vector loads is 1, and so the misalignment does not remain
6749      fixed across the execution of the inner-loop, and the realignment cannot
6750      be optimized (as illustrated in the following pseudo vectorized loop):
6751
6752       for (i=0; i<N; i+=4)
6753         for (j=0; j<M; j++){
6754           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6755                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6756                          // (assuming that we start from an aligned address).
6757           }
6758
6759      We therefore have to use the unoptimized realignment scheme:
6760
6761       for (i=0; i<N; i+=4)
6762           for (j=k; j<M; j+=4)
6763           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6764                            // that the misalignment of the initial address is
6765                            // 0).
6766
6767      The loop can then be vectorized as follows:
6768
6769       for (k=0; k<4; k++){
6770         rt = get_realignment_token (&vp[k]);
6771         for (i=0; i<N; i+=4){
6772           v1 = vp[i+k];
6773           for (j=k; j<M; j+=4){
6774             v2 = vp[i+j+VS-1];
6775             va = REALIGN_LOAD <v1,v2,rt>;
6776             vs += va;
6777             v1 = v2;
6778           }
6779         }
6780     } */
6781
6782   if (DR_IS_READ (dr))
6783     {
6784       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6785           && (!targetm.vectorize.builtin_mask_for_load
6786               || targetm.vectorize.builtin_mask_for_load ()))
6787         {
6788           /* If we are doing SLP then the accesses need not have the
6789              same alignment, instead it depends on the SLP group size.  */
6790           if (loop_vinfo
6791               && STMT_SLP_TYPE (stmt_info)
6792               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6793                               * (DR_GROUP_SIZE
6794                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6795                               TYPE_VECTOR_SUBPARTS (vectype)))
6796             ;
6797           else if (!loop_vinfo
6798                    || (nested_in_vect_loop
6799                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6800                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6801             return dr_explicit_realign;
6802           else
6803             return dr_explicit_realign_optimized;
6804         }
6805     }
6806
6807   bool is_packed = false;
6808   tree type = TREE_TYPE (DR_REF (dr));
6809   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6810     is_packed = not_size_aligned (DR_REF (dr));
6811   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6812                                                      is_packed))
6813     return dr_unaligned_supported;
6814
6815   /* Unsupported.  */
6816   return dr_unaligned_unsupported;
6817 }