gcc/tree-vect-data-refs.cc

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56 #include "gimple-fold.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100
 101 /* Return the smallest scalar part of STMT_INFO.
 102    This is used to determine the vectype of the stmt.  We generally set the
 103    vectype according to the type of the result (lhs).  For stmts whose
 104    result-type is different than the type of the arguments (e.g., demotion,
 105    promotion), vectype will be reset appropriately (later).  Note that we have
 106    to visit the smallest datatype in this function, because that determines the
 107    VF.  If the smallest datatype in the loop is present only as the rhs of a
 108    promotion operation - we'd miss it.
 109    Such a case, where a variable of this datatype does not appear in the lhs
 110    anywhere in the loop, can only occur if it's an invariant: e.g.:
 111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 112    invariant motion.  However, we cannot rely on invariant motion to always
 113    take invariants out of the loop, and so in the case of promotion we also
 114    have to check the rhs.
 115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 116    types.  */
 117
 118 tree
 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 120 {
 121   HOST_WIDE_INT lhs, rhs;
 122
 123   /* During the analysis phase, this function is called on arbitrary
 124      statements that might not have scalar results.  */
 125   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 126     return scalar_type;
 127
 128   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 129
 130   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 131   if (assign)
 132     {
 133       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
 134       if (gimple_assign_cast_p (assign)
 135           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
 140           || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
 141           || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
 142         {
 143           tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 144
 145           rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 146           if (rhs < lhs)
 147             scalar_type = rhs_type;
 148         }
 149     }
 150   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 151     {
 152       unsigned int i = 0;
 153       if (gimple_call_internal_p (call))
 154         {
 155           internal_fn ifn = gimple_call_internal_fn (call);
 156           if (internal_load_fn_p (ifn))
 157             /* For loads the LHS type does the trick.  */
 158             i = ~0U;
 159           else if (internal_store_fn_p (ifn))
 160             {
 161               /* For stores use the tyep of the stored value.  */
 162               i = internal_fn_stored_value_index (ifn);
 163               scalar_type = TREE_TYPE (gimple_call_arg (call, i));
 164               i = ~0U;
 165             }
 166           else if (internal_fn_mask_index (ifn) == 0)
 167             i = 1;
 168         }
 169       if (i < gimple_call_num_args (call))
 170         {
 171           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 172           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 173             {
 174               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 175               if (rhs < lhs)
 176                 scalar_type = rhs_type;
 177             }
 178         }
 179     }
 180
 181   return scalar_type;
 182 }
 183
 184
 185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 186    tested at run-time.  Return TRUE if DDR was successfully inserted.
 187    Return false if versioning is not supported.  */
 188
 189 static opt_result
 190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 191 {
 192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 193
 194   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 195     return opt_result::failure_at (vect_location,
 196                                    "will not create alias checks, as"
 197                                    " --param vect-max-version-for-alias-checks"
 198                                    " == 0\n");
 199
 200   opt_result res
 201     = runtime_alias_check_p (ddr, loop,
 202                              optimize_loop_nest_for_speed_p (loop));
 203   if (!res)
 204     return res;
 205
 206   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 207   return opt_result::success ();
 208 }
 209
 210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 211
 212 static void
 213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 214 {
 215   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 216   for (unsigned int i = 0; i < checks.length(); ++i)
 217     if (checks[i] == value)
 218       return;
 219
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location,
 222                      "need run-time check that %T is nonzero\n",
 223                      value);
 224   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 225 }
 226
 227 /* Return true if we know that the order of vectorized DR_INFO_A and
 228    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 229    DR_INFO_B.  At least one of the accesses is a write.  */
 230
 231 static bool
 232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 233 {
 234   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 235   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 236
 237   /* Single statements are always kept in their original order.  */
 238   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 239       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 240     return true;
 241
 242   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 243      emitted at the position of the first scalar load.
 244      Stores in a group are emitted at the position of the last scalar store.
 245      Compute that position and check whether the resulting order matches
 246      the current one.  */
 247   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 248   if (il_a)
 249     {
 250       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 251         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 252              s = DR_GROUP_NEXT_ELEMENT (s))
 253           il_a = get_later_stmt (il_a, s);
 254       else /* DR_IS_READ */
 255         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 256              s = DR_GROUP_NEXT_ELEMENT (s))
 257           if (get_later_stmt (il_a, s) == il_a)
 258             il_a = s;
 259     }
 260   else
 261     il_a = stmtinfo_a;
 262   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 263   if (il_b)
 264     {
 265       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 266         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 267              s = DR_GROUP_NEXT_ELEMENT (s))
 268           il_b = get_later_stmt (il_b, s);
 269       else /* DR_IS_READ */
 270         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 271              s = DR_GROUP_NEXT_ELEMENT (s))
 272           if (get_later_stmt (il_b, s) == il_b)
 273             il_b = s;
 274     }
 275   else
 276     il_b = stmtinfo_b;
 277   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 278   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 279 }
 280
 281 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 282    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 283    distances.  These distances are conservatively correct but they don't
 284    reflect a guaranteed dependence.
 285
 286    Return true if this function does all the work necessary to avoid
 287    an alias or false if the caller should use the dependence distances
 288    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 289    the depth of the loop described by LOOP_VINFO and the other arguments
 290    are as for vect_analyze_data_ref_dependence.  */
 291
 292 static bool
 293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 294                                        loop_vec_info loop_vinfo,
 295                                        int loop_depth, unsigned int *max_vf)
 296 {
 297   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 298   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
 299     {
 300       int dist = dist_v[loop_depth];
 301       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 302         {
 303           /* If the user asserted safelen >= DIST consecutive iterations
 304              can be executed concurrently, assume independence.
 305
 306              ??? An alternative would be to add the alias check even
 307              in this case, and vectorize the fallback loop with the
 308              maximum VF set to safelen.  However, if the user has
 309              explicitly given a length, it's less likely that that
 310              would be a win.  */
 311           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 312             {
 313               if ((unsigned int) loop->safelen < *max_vf)
 314                 *max_vf = loop->safelen;
 315               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 316               continue;
 317             }
 318
 319           /* For dependence distances of 2 or more, we have the option
 320              of limiting VF or checking for an alias at runtime.
 321              Prefer to check at runtime if we can, to avoid limiting
 322              the VF unnecessarily when the bases are in fact independent.
 323
 324              Note that the alias checks will be removed if the VF ends up
 325              being small enough.  */
 326           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 327           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 328           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 329                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 330                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 331         }
 332     }
 333   return true;
 334 }
 335
 336
  337 /* Function vect_analyze_data_ref_dependence.
  338
  339    Analyze DDR, the dependence relation between the memory references
  340    DRA = DDR_A (DDR) and DRB = DDR_B (DDR) of the loop in LOOP_VINFO.
  341    Return a failure if there (might) exist a dependence between DRA and
  342    DRB that prevents vectorization.  When versioning for alias may check
  343    the dependence at run-time, return the result of queueing that check.
  344    Adjust *MAX_VF according to the data dependence.  */
  345
  346 static opt_result
  347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
  348                                   loop_vec_info loop_vinfo,
  349                                   unsigned int *max_vf)
  350 {
  351   unsigned int i;
  352   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  353   struct data_reference *dra = DDR_A (ddr);
  354   struct data_reference *drb = DDR_B (ddr);
  355   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  356   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  357   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  358   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  359   lambda_vector dist_v;
  360   unsigned int loop_depth;
  361
  362   /* If the user asserted that safelen (>= 2) consecutive iterations can be
  363      executed concurrently, assume independence and cap *MAX_VF at safelen.  */
  364   auto apply_safelen = [&]()
  365     {
  366       if (loop->safelen >= 2)
  367         {
  368           if ((unsigned int) loop->safelen < *max_vf)
  369             *max_vf = loop->safelen;
  370           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
  371           return true;
  372         }
  373       return false;
  374     };
  375
  376   /* In loop analysis all data references should be vectorizable.  */
  377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
  378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
  379     gcc_unreachable ();
  380
  381   /* Independent data accesses.  */
  382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
  383     return opt_result::success ();
  384   /* An access paired with itself and two reads need no checking.  */
  385   if (dra == drb
  386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
  387     return opt_result::success ();
  388
  389   /* We do not have to consider dependences between accesses that belong
  390      to the same group, unless the stride could be smaller than the
  391      group size.  */
  392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
  393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
  394           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
  395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
  396     return opt_result::success ();
  397
  398   /* Even if we have an anti-dependence then, as the vectorized loop covers at
  399      least two scalar iterations, there is always also a true dependence.
  400      As the vectorizer does not re-order loads and stores we can ignore
  401      the anti-dependence if TBAA can disambiguate both DRs similar to the
  402      case with known negative distance anti-dependences (positive
  403      distance anti-dependences would violate TBAA constraints).  */
  404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
  405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
  406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
  407                                  get_alias_set (DR_REF (drb))))
  408     return opt_result::success ();
  409
  410   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
  411       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
  412     {
  413       if (apply_safelen ())
  414         return opt_result::success ();
  415
  416       return opt_result::failure_at
  417         (stmtinfo_a->stmt,
  418          "possible alias involving gather/scatter between %T and %T\n",
  419          DR_REF (dra), DR_REF (drb));
  420     }
  421
  422   /* Unknown data dependence.  */
  423   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
  424     {
  425       if (apply_safelen ())
  426         return opt_result::success ();
  427
  428       if (dump_enabled_p ())
  429         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
  430                          "versioning for alias required: "
  431                          "can't determine dependence between %T and %T\n",
  432                          DR_REF (dra), DR_REF (drb));
  433
  434       /* Add to list of ddrs that need to be tested at run-time.  */
  435       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
  436     }
  437
  438   /* Known data dependence.  */
  439   if (DDR_NUM_DIST_VECTS (ddr) == 0)
  440     {
  441       if (apply_safelen ())
  442         return opt_result::success ();
  443
  444       if (dump_enabled_p ())
  445         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
  446                          "versioning for alias required: "
  447                          "bad dist vector for %T and %T\n",
  448                          DR_REF (dra), DR_REF (drb));
  449       /* Add to list of ddrs that need to be tested at run-time.  */
  450       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
  451     }
  452
  453   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  454
  455   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
  456       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
  457                                                 loop_depth, max_vf))
  458     return opt_result::success ();
  459
  460   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
  461     {
  462       int dist = dist_v[loop_depth];
  463
  464       if (dump_enabled_p ())
  465         dump_printf_loc (MSG_NOTE, vect_location,
  466                          "dependence distance  = %d.\n", dist);
  467
  468       if (dist == 0)
  469         {
  470           if (dump_enabled_p ())
  471             dump_printf_loc (MSG_NOTE, vect_location,
  472                              "dependence distance == 0 between %T and %T\n",
  473                              DR_REF (dra), DR_REF (drb));
  474
  475           /* When we perform grouped accesses and perform implicit CSE
  476              by detecting equal accesses and doing disambiguation with
  477              runtime alias tests like for
  478                 .. = a[i];
  479                 .. = a[i+1];
  480                 a[i] = ..;
  481                 a[i+1] = ..;
  482                 *p = ..;
  483                 .. = a[i];
  484                 .. = a[i+1];
  485              where we will end up loading { a[i], a[i+1] } once, make
  486              sure that inserting group loads before the first load and
  487              stores after the last store will do the right thing.
  488              Similar for groups like
  489                 a[i] = ...;
  490                 ... = a[i];
  491                 a[i+1] = ...;
  492              where loads from the group interleave with the store.  */
  493           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
  494             return opt_result::failure_at (stmtinfo_a->stmt,
  495                                            "READ_WRITE dependence"
  496                                            " in interleaving.\n");
  497           /* Without a safelen, DRA must not (possibly) have a zero step.  */
  498           if (loop->safelen < 2)
  499             {
  500               tree indicator = dr_zero_step_indicator (dra);
  501               if (!indicator || integer_zerop (indicator))
  502                 return opt_result::failure_at (stmtinfo_a->stmt,
  503                                                "access also has a zero step\n");
  504               else if (TREE_CODE (indicator) != INTEGER_CST)
  505                 vect_check_nonzero_value (loop_vinfo, indicator);
  506             }
  507           continue;
  508         }
  509
  510       if (dist > 0 && DDR_REVERSED_P (ddr))
  511         {
  512           /* If DDR_REVERSED_P the order of the data-refs in DDR was
  513              reversed (to make distance vector positive), and the actual
  514              distance is negative.  */
  515           if (dump_enabled_p ())
  516             dump_printf_loc (MSG_NOTE, vect_location,
  517                              "dependence distance negative.\n");
  518           /* When doing outer loop vectorization, we need to check if there is
  519              a backward dependence at the inner loop level if the dependence
  520              at the outer loop is reversed.  See PR81740.  */
  521           if (nested_in_vect_loop_p (loop, stmtinfo_a)
  522               || nested_in_vect_loop_p (loop, stmtinfo_b))
  523             {
  524               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
  525                                                          DDR_LOOP_NEST (ddr));
  526               if (dist_v[inner_depth] < 0)
  527                 return opt_result::failure_at (stmtinfo_a->stmt,
  528                                                "not vectorized, dependence "
  529                                                "between data-refs %T and %T\n",
  530                                                DR_REF (dra), DR_REF (drb));
  531             }
  532           /* Record a negative dependence distance to later limit the
  533              amount of stmt copying / unrolling we can perform.
  534              Only need to handle read-after-write dependence.  */
  535           if (DR_IS_READ (drb)
  536               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
  537                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
  538             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
  539           continue;
  540         }
  541
  542       unsigned int abs_dist = abs (dist);
  543       if (abs_dist >= 2 && abs_dist < *max_vf)
  544         {
  545           /* The dependence distance requires reduction of the maximal
  546              vectorization factor.  */
  547           *max_vf = abs_dist;
  548           if (dump_enabled_p ())
  549             dump_printf_loc (MSG_NOTE, vect_location,
  550                              "adjusting maximal vectorization factor to %i\n",
  551                              *max_vf);
  552         }
  553
  554       if (abs_dist >= *max_vf)
  555         {
  556           /* Dependence distance does not create dependence, as far as
  557              vectorization is concerned, in this case.  */
  558           if (dump_enabled_p ())
  559             dump_printf_loc (MSG_NOTE, vect_location,
  560                              "dependence distance >= VF.\n");
  561           continue;
  562         }
  563
  564       return opt_result::failure_at (stmtinfo_a->stmt,
  565                                      "not vectorized, possible dependence "
  566                                      "between data-refs %T and %T\n",
  567                                      DR_REF (dra), DR_REF (drb));
  568     }
  569
  570   return opt_result::success ();
  571 }
 572
 573 /* Function vect_analyze_data_ref_dependences.
 574
 575    Examine all the data references in the loop, and make sure there do not
 576    exist any data dependences between them.  Set *MAX_VF according to
 577    the maximum vectorization factor the data dependences allow.  */
 578
 579 opt_result
 580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 581                                    unsigned int *max_vf)
 582 {
 583   unsigned int i;
 584   struct data_dependence_relation *ddr;
 585
 586   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 587
 588   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 589     {
 590       LOOP_VINFO_DDRS (loop_vinfo)
 591         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 592                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 593       /* We do not need read-read dependences.  */
 594       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 595                                           &LOOP_VINFO_DDRS (loop_vinfo),
 596                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 597                                           false);
 598       gcc_assert (res);
 599     }
 600
 601   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 602
 603   /* For epilogues we either have no aliases or alias versioning
 604      was applied to original loop.  Therefore we may just get max_vf
 605      using VF of original loop.  */
 606   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 607     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 608   else
 609     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 610       {
 611         opt_result res
 612           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 613         if (!res)
 614           return res;
 615       }
 616
 617   return opt_result::success ();
 618 }
 619
 620
 621 /* Function vect_slp_analyze_data_ref_dependence.
 622
 623    Return TRUE if there (might) exist a dependence between a memory-reference
 624    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 625    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 626    according to the data dependence.  */
 627
 628 static bool
 629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 630                                       struct data_dependence_relation *ddr)
 631 {
 632   struct data_reference *dra = DDR_A (ddr);
 633   struct data_reference *drb = DDR_B (ddr);
 634   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 635   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 636
 637   /* We need to check dependences of statements marked as unvectorizable
 638      as well, they still can prohibit vectorization.  */
 639
 640   /* Independent data accesses.  */
 641   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 642     return false;
 643
 644   if (dra == drb)
 645     return false;
 646
 647   /* Read-read is OK.  */
 648   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 649     return false;
 650
 651   /* If dra and drb are part of the same interleaving chain consider
 652      them independent.  */
 653   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 654       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 655           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 656     return false;
 657
 658   /* Unknown data dependence.  */
 659   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 660     {
 661       if  (dump_enabled_p ())
 662         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 663                          "can't determine dependence between %T and %T\n",
 664                          DR_REF (dra), DR_REF (drb));
 665     }
 666   else if (dump_enabled_p ())
 667     dump_printf_loc (MSG_NOTE, vect_location,
 668                      "determined dependence between %T and %T\n",
 669                      DR_REF (dra), DR_REF (drb));
 670
 671   return true;
 672 }
 673
 674
 675 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 676    contain the vector of scalar stores of this instance if we are
 677    disambiguating the loads.  */
 678
 679 static bool
 680 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
 681                                    vec<stmt_vec_info> stores,
 682                                    stmt_vec_info last_store_info)
 683 {
 684   /* This walks over all stmts involved in the SLP load/store done
 685      in NODE verifying we can sink them up to the last stmt in the
 686      group.  */
 687   if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
 688     {
 689       stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 690       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 691         {
 692           stmt_vec_info access_info
 693             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 694           if (access_info == last_access_info)
 695             continue;
 696           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 697           ao_ref ref;
 698           bool ref_initialized_p = false;
 699           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 700                gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 701             {
 702               gimple *stmt = gsi_stmt (gsi);
 703               if (! gimple_vuse (stmt))
 704                 continue;
 705
 706               /* If we couldn't record a (single) data reference for this
 707                  stmt we have to resort to the alias oracle.  */
 708               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 709               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 710               if (!dr_b)
 711                 {
 712                   /* We are moving a store - this means
 713                      we cannot use TBAA for disambiguation.  */
 714                   if (!ref_initialized_p)
 715                     ao_ref_init (&ref, DR_REF (dr_a));
 716                   if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 717                       || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 718                     return false;
 719                   continue;
 720                 }
 721
 722               bool dependent = false;
 723               /* If we run into a store of this same instance (we've just
 724                  marked those) then delay dependence checking until we run
 725                  into the last store because this is where it will have
 726                  been sunk to (and we verify if we can do that as well).  */
 727               if (gimple_visited_p (stmt))
 728                 {
 729                   if (stmt_info != last_store_info)
 730                     continue;
 731
 732                   for (stmt_vec_info &store_info : stores)
 733                     {
 734                       data_reference *store_dr
 735                         = STMT_VINFO_DATA_REF (store_info);
 736                       ddr_p ddr = initialize_data_dependence_relation
 737                                     (dr_a, store_dr, vNULL);
 738                       dependent
 739                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 740                       free_dependence_relation (ddr);
 741                       if (dependent)
 742                         break;
 743                     }
 744                 }
 745               else
 746                 {
 747                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 748                                                                    dr_b, vNULL);
 749                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 750                   free_dependence_relation (ddr);
 751                 }
 752               if (dependent)
 753                 return false;
 754             }
 755         }
 756     }
 757   else /* DR_IS_READ */
 758     {
 759       stmt_vec_info first_access_info
 760         = vect_find_first_scalar_stmt_in_slp (node);
 761       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 762         {
 763           stmt_vec_info access_info
 764             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 765           if (access_info == first_access_info)
 766             continue;
 767           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 768           ao_ref ref;
 769           bool ref_initialized_p = false;
 770           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 771                gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
 772             {
 773               gimple *stmt = gsi_stmt (gsi);
 774               if (! gimple_vdef (stmt))
 775                 continue;
 776
 777               /* If we couldn't record a (single) data reference for this
 778                  stmt we have to resort to the alias oracle.  */
 779               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 780               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 781
 782               /* We are hoisting a load - this means we can use
 783                  TBAA for disambiguation.  */
 784               if (!ref_initialized_p)
 785                 ao_ref_init (&ref, DR_REF (dr_a));
 786               if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
 787                 {
 788                   if (!dr_b)
 789                     return false;
 790                   /* Resort to dependence checking below.  */
 791                 }
 792               else
 793                 /* No dependence.  */
 794                 continue;
 795
 796               bool dependent = false;
 797               /* If we run into a store of this same instance (we've just
 798                  marked those) then delay dependence checking until we run
 799                  into the last store because this is where it will have
 800                  been sunk to (and we verify if we can do that as well).  */
 801               if (gimple_visited_p (stmt))
 802                 {
 803                   if (stmt_info != last_store_info)
 804                     continue;
 805
 806                   for (stmt_vec_info &store_info : stores)
 807                     {
 808                       data_reference *store_dr
 809                         = STMT_VINFO_DATA_REF (store_info);
 810                       ddr_p ddr = initialize_data_dependence_relation
 811                                     (dr_a, store_dr, vNULL);
 812                       dependent
 813                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 814                       free_dependence_relation (ddr);
 815                       if (dependent)
 816                         break;
 817                     }
 818                 }
 819               else
 820                 {
 821                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 822                                                                    dr_b, vNULL);
 823                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 824                   free_dependence_relation (ddr);
 825                 }
 826               if (dependent)
 827                 return false;
 828             }
 829         }
 830     }
 831   return true;
 832 }
 833
 834
 835 /* Function vect_analyze_data_ref_dependences.
 836
 837    Examine all the data references in the basic-block, and make sure there
 838    do not exist any data dependences between them.  Set *MAX_VF according to
 839    the maximum vectorization factor the data dependences allow.  */
 840
 841 bool
 842 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
 843 {
 844   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 845
 846   /* The stores of this instance are at the root of the SLP tree.  */
 847   slp_tree store = NULL;
 848   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
 849     store = SLP_INSTANCE_TREE (instance);
 850
 851   /* Verify we can sink stores to the vectorized stmt insert location.  */
 852   stmt_vec_info last_store_info = NULL;
 853   if (store)
 854     {
 855       if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
 856         return false;
 857
 858       /* Mark stores in this instance and remember the last one.  */
 859       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 860       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 861         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 862     }
 863
 864   bool res = true;
 865
 866   /* Verify we can sink loads to the vectorized stmt insert location,
 867      special-casing stores of this instance.  */
 868   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
 869     if (! vect_slp_analyze_node_dependences (vinfo, load,
 870                                              store
 871                                              ? SLP_TREE_SCALAR_STMTS (store)
 872                                              : vNULL, last_store_info))
 873       {
 874         res = false;
 875         break;
 876       }
 877
 878   /* Unset the visited flag.  */
 879   if (store)
 880     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 881       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 882
 883   return res;
 884 }
 885
 886 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
 887    applied.  */
 888
 889 int
 890 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
 891 {
 892   HOST_WIDE_INT diff = 0;
 893   /* Alignment is only analyzed for the first element of a DR group,
 894      use that but adjust misalignment by the offset of the access.  */
 895   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
 896     {
 897       dr_vec_info *first_dr
 898         = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
 899       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
 900          INTEGER_CSTs and the first element in the group has the lowest
 901          address.  */
 902       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
 903               - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
 904       gcc_assert (diff >= 0);
 905       dr_info = first_dr;
 906     }
 907
 908   int misalign = dr_info->misalignment;
 909   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
 910   if (misalign == DR_MISALIGNMENT_UNKNOWN)
 911     return misalign;
 912
 913   /* If the access is only aligned for a vector type with smaller alignment
 914      requirement the access has unknown misalignment.  */
 915   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
 916                 targetm.vectorize.preferred_vector_alignment (vectype)))
 917     return DR_MISALIGNMENT_UNKNOWN;
 918
 919   /* Apply the offset from the DR group start and the externally supplied
 920      offset which can for example result from a negative stride access.  */
 921   poly_int64 misalignment = misalign + diff + offset;
 922
 923   /* vect_compute_data_ref_alignment will have ensured that target_alignment
 924      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
 925   unsigned HOST_WIDE_INT target_alignment_c
 926     = dr_info->target_alignment.to_constant ();
 927   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
 928     return DR_MISALIGNMENT_UNKNOWN;
 929   return misalign;
 930 }
 931
 932 /* Record the base alignment guarantee given by DRB, which occurs
 933    in STMT_INFO.  */
 934
 935 static void
 936 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
 937                             innermost_loop_behavior *drb)
 938 {
 939   bool existed;
 940   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
 941     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 942   if (!existed || entry.second->base_alignment < drb->base_alignment)
 943     {
 944       entry = std::make_pair (stmt_info, drb);
 945       if (dump_enabled_p ())
 946         dump_printf_loc (MSG_NOTE, vect_location,
 947                          "recording new base alignment for %T\n"
 948                          "  alignment:    %d\n"
 949                          "  misalignment: %d\n"
 950                          "  based on:     %G",
 951                          drb->base_address,
 952                          drb->base_alignment,
 953                          drb->base_misalignment,
 954                          stmt_info->stmt);
 955     }
 956 }
 957
 958 /* If the region we're going to vectorize is reached, all unconditional
 959    data references occur at least once.  We can therefore pool the base
 960    alignment guarantees from each unconditional reference.  Do this by
 961    going through all the data references in VINFO and checking whether
 962    the containing statement makes the reference unconditionally.  If so,
 963    record the alignment of the base address in VINFO so that it can be
 964    used for all other references with the same base.  */
 965
 966 void
 967 vect_record_base_alignments (vec_info *vinfo)
 968 {
 969   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 970   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 971   for (data_reference *dr : vinfo->shared->datarefs)
 972     {
 973       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 974       stmt_vec_info stmt_info = dr_info->stmt;
 975       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 976           && STMT_VINFO_VECTORIZABLE (stmt_info)
 977           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 978         {
 979           vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
 980
 981           /* If DR is nested in the loop that is being vectorized, we can also
 982              record the alignment of the base wrt the outer loop.  */
 983           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 984             vect_record_base_alignment
 985               (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 986         }
 987     }
 988 }
 989
 990 /* Function vect_compute_data_ref_alignment
 991
 992    Compute the misalignment of the data reference DR_INFO when vectorizing
 993    with VECTYPE.
 994
 995    Output:
 996    1. initialized misalignment info for DR_INFO
 997
 998    FOR NOW: No analysis is actually performed. Misalignment is calculated
 999    only for trivial cases. TODO.  */
1000
1001 static void
1002 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1003                                  tree vectype)
1004 {
1005   stmt_vec_info stmt_info = dr_info->stmt;
1006   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1007   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1008   class loop *loop = NULL;
1009   tree ref = DR_REF (dr_info->dr);
1010
1011   if (dump_enabled_p ())
1012     dump_printf_loc (MSG_NOTE, vect_location,
1013                      "vect_compute_data_ref_alignment:\n");
1014
1015   if (loop_vinfo)
1016     loop = LOOP_VINFO_LOOP (loop_vinfo);
1017
1018   /* Initialize misalignment to unknown.  */
1019   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1020
1021   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1022     return;
1023
1024   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1025   bool step_preserves_misalignment_p;
1026
1027   poly_uint64 vector_alignment
1028     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1029                  BITS_PER_UNIT);
1030   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1031
1032   /* If the main loop has peeled for alignment we have no way of knowing
1033      whether the data accesses in the epilogues are aligned.  We can't at
1034      compile time answer the question whether we have entered the main loop or
1035      not.  Fixes PR 92351.  */
1036   if (loop_vinfo)
1037     {
1038       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1039       if (orig_loop_vinfo
1040           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1041         return;
1042     }
1043
1044   unsigned HOST_WIDE_INT vect_align_c;
1045   if (!vector_alignment.is_constant (&vect_align_c))
1046     return;
1047
1048   /* No step for BB vectorization.  */
1049   if (!loop)
1050     {
1051       gcc_assert (integer_zerop (drb->step));
1052       step_preserves_misalignment_p = true;
1053     }
1054
1055   /* In case the dataref is in an inner-loop of the loop that is being
1056      vectorized (LOOP), we use the base and misalignment information
1057      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1058      stays the same throughout the execution of the inner-loop, which is why
1059      we have to check that the stride of the dataref in the inner-loop evenly
1060      divides by the vector alignment.  */
1061   else if (nested_in_vect_loop_p (loop, stmt_info))
1062     {
1063       step_preserves_misalignment_p
1064         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1065
1066       if (dump_enabled_p ())
1067         {
1068           if (step_preserves_misalignment_p)
1069             dump_printf_loc (MSG_NOTE, vect_location,
1070                              "inner step divides the vector alignment.\n");
1071           else
1072             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1073                              "inner step doesn't divide the vector"
1074                              " alignment.\n");
1075         }
1076     }
1077
1078   /* Similarly we can only use base and misalignment information relative to
1079      an innermost loop if the misalignment stays the same throughout the
1080      execution of the loop.  As above, this is the case if the stride of
1081      the dataref evenly divides by the alignment.  */
1082   else
1083     {
1084       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1085       step_preserves_misalignment_p
1086         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1087
1088       if (!step_preserves_misalignment_p && dump_enabled_p ())
1089         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090                          "step doesn't divide the vector alignment.\n");
1091     }
1092
1093   unsigned int base_alignment = drb->base_alignment;
1094   unsigned int base_misalignment = drb->base_misalignment;
1095
1096   /* Calculate the maximum of the pooled base address alignment and the
1097      alignment that we can compute for DR itself.  */
1098   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1099     = base_alignments->get (drb->base_address);
1100   if (entry
1101       && base_alignment < (*entry).second->base_alignment
1102       && (loop_vinfo
1103           || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1104                               gimple_bb (entry->first->stmt))
1105               && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1106                   || (entry->first->dr_aux.group <= dr_info->group)))))
1107     {
1108       base_alignment = entry->second->base_alignment;
1109       base_misalignment = entry->second->base_misalignment;
1110     }
1111
1112   if (drb->offset_alignment < vect_align_c
1113       || !step_preserves_misalignment_p
1114       /* We need to know whether the step wrt the vectorized loop is
1115          negative when computing the starting misalignment below.  */
1116       || TREE_CODE (drb->step) != INTEGER_CST)
1117     {
1118       if (dump_enabled_p ())
1119         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120                          "Unknown alignment for access: %T\n", ref);
1121       return;
1122     }
1123
1124   if (base_alignment < vect_align_c)
1125     {
1126       unsigned int max_alignment;
1127       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1128       if (max_alignment < vect_align_c
1129           || !vect_can_force_dr_alignment_p (base,
1130                                              vect_align_c * BITS_PER_UNIT))
1131         {
1132           if (dump_enabled_p ())
1133             dump_printf_loc (MSG_NOTE, vect_location,
1134                              "can't force alignment of ref: %T\n", ref);
1135           return;
1136         }
1137
1138       /* Force the alignment of the decl.
1139          NOTE: This is the only change to the code we make during
1140          the analysis phase, before deciding to vectorize the loop.  */
1141       if (dump_enabled_p ())
1142         dump_printf_loc (MSG_NOTE, vect_location,
1143                          "force alignment of %T\n", ref);
1144
1145       dr_info->base_decl = base;
1146       dr_info->base_misaligned = true;
1147       base_misalignment = 0;
1148     }
1149   poly_int64 misalignment
1150     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1151
1152   unsigned int const_misalignment;
1153   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1154     {
1155       if (dump_enabled_p ())
1156         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157                          "Non-constant misalignment for access: %T\n", ref);
1158       return;
1159     }
1160
1161   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1162
1163   if (dump_enabled_p ())
1164     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165                      "misalign = %d bytes of ref %T\n",
1166                      const_misalignment, ref);
1167
1168   return;
1169 }
1170
1171 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1172    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1173    is made aligned via peeling.  */
1174
1175 static bool
1176 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1177                                          dr_vec_info *dr_peel_info)
1178 {
1179   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1180                   DR_TARGET_ALIGNMENT (dr_info)))
1181     {
1182       poly_offset_int diff
1183         = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1184            - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1185       if (known_eq (diff, 0)
1186           || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1187         return true;
1188     }
1189   return false;
1190 }
1191
1192 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1193    aligned via peeling.  */
1194
1195 static bool
1196 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1197                                  dr_vec_info *dr_peel_info)
1198 {
1199   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1200                         DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1201       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1202                            DR_OFFSET (dr_peel_info->dr), 0)
1203       || !operand_equal_p (DR_STEP (dr_info->dr),
1204                            DR_STEP (dr_peel_info->dr), 0))
1205     return false;
1206
1207   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 }
1209
1210 /* Compute the value for dr_info->misalign so that the access appears
1211    aligned.  This is used by peeling to compensate for dr_misalignment
1212    applying the offset for negative step.  */
1213
1214 int
1215 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1216 {
1217   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1218     return 0;
1219
1220   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1221   poly_int64 misalignment
1222     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1223        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1224
1225   unsigned HOST_WIDE_INT target_alignment_c;
1226   int misalign;
1227   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1228       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1229     return DR_MISALIGNMENT_UNKNOWN;
1230   return misalign;
1231 }
1232
1233 /* Function vect_update_misalignment_for_peel.
1234    Sets DR_INFO's misalignment
1235    - to 0 if it has the same alignment as DR_PEEL_INFO,
1236    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1237    - to -1 (unknown) otherwise.
1238
1239    DR_INFO - the data reference whose misalignment is to be adjusted.
1240    DR_PEEL_INFO - the data reference whose misalignment is being made
1241                   zero in the vector loop by the peel.
1242    NPEEL - the number of iterations in the peel loop if the misalignment
1243            of DR_PEEL_INFO is known at compile time.  */
1244
1245 static void
1246 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1247                                    dr_vec_info *dr_peel_info, int npeel)
1248 {
1249   /* If dr_info is aligned of dr_peel_info is, then mark it so.  */
1250   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1251     {
1252       SET_DR_MISALIGNMENT (dr_info,
1253                            vect_dr_misalign_for_aligned_access (dr_peel_info));
1254       return;
1255     }
1256
1257   unsigned HOST_WIDE_INT alignment;
1258   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1259       && known_alignment_for_access_p (dr_info,
1260                                        STMT_VINFO_VECTYPE (dr_info->stmt))
1261       && known_alignment_for_access_p (dr_peel_info,
1262                                        STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1263     {
1264       int misal = dr_info->misalignment;
1265       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1266       misal &= alignment - 1;
1267       set_dr_misalignment (dr_info, misal);
1268       return;
1269     }
1270
1271   if (dump_enabled_p ())
1272     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1273                      "to unknown (-1).\n");
1274   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1275 }
1276
1277 /* Return true if alignment is relevant for DR_INFO.  */
1278
1279 static bool
1280 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1281 {
1282   stmt_vec_info stmt_info = dr_info->stmt;
1283
1284   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1285     return false;
1286
1287   /* For interleaving, only the alignment of the first access matters.  */
1288   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1289       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1290     return false;
1291
1292   /* Scatter-gather and invariant accesses continue to address individual
1293      scalars, so vector-level alignment is irrelevant.  */
1294   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1295       || integer_zerop (DR_STEP (dr_info->dr)))
1296     return false;
1297
1298   /* Strided accesses perform only component accesses, alignment is
1299      irrelevant for them.  */
1300   if (STMT_VINFO_STRIDED_P (stmt_info)
1301       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1302     return false;
1303
1304   return true;
1305 }
1306
1307 /* Given an memory reference EXP return whether its alignment is less
1308    than its size.  */
1309
1310 static bool
1311 not_size_aligned (tree exp)
1312 {
1313   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1314     return true;
1315
1316   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1317           > get_object_alignment (exp));
1318 }
1319
1320 /* Function vector_alignment_reachable_p
1321
1322    Return true if vector alignment for DR_INFO is reachable by peeling
1323    a few loop iterations.  Return false otherwise.  */
1324
1325 static bool
1326 vector_alignment_reachable_p (dr_vec_info *dr_info)
1327 {
1328   stmt_vec_info stmt_info = dr_info->stmt;
1329   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1330
1331   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1332     {
1333       /* For interleaved access we peel only if number of iterations in
1334          the prolog loop ({VF - misalignment}), is a multiple of the
1335          number of the interleaved accesses.  */
1336       int elem_size, mis_in_elements;
1337
1338       /* FORNOW: handle only known alignment.  */
1339       if (!known_alignment_for_access_p (dr_info, vectype))
1340         return false;
1341
1342       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1343       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1344       elem_size = vector_element_size (vector_size, nelements);
1345       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1346
1347       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1348         return false;
1349     }
1350
1351   /* If misalignment is known at the compile time then allow peeling
1352      only if natural alignment is reachable through peeling.  */
1353   if (known_alignment_for_access_p (dr_info, vectype)
1354       && !aligned_access_p (dr_info, vectype))
1355     {
1356       HOST_WIDE_INT elmsize =
1357                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1358       if (dump_enabled_p ())
1359         {
1360           dump_printf_loc (MSG_NOTE, vect_location,
1361                            "data size = %wd. misalignment = %d.\n", elmsize,
1362                            dr_misalignment (dr_info, vectype));
1363         }
1364       if (dr_misalignment (dr_info, vectype) % elmsize)
1365         {
1366           if (dump_enabled_p ())
1367             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368                              "data size does not divide the misalignment.\n");
1369           return false;
1370         }
1371     }
1372
1373   if (!known_alignment_for_access_p (dr_info, vectype))
1374     {
1375       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1376       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1377       if (dump_enabled_p ())
1378         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379                          "Unknown misalignment, %snaturally aligned\n",
1380                          is_packed ? "not " : "");
1381       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382     }
1383
1384   return true;
1385 }
1386
1387
1388 /* Calculate the cost of the memory access represented by DR_INFO.  */
1389
1390 static void
1391 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1392                            dr_alignment_support alignment_support_scheme,
1393                            int misalignment,
1394                            unsigned int *inside_cost,
1395                            unsigned int *outside_cost,
1396                            stmt_vector_for_cost *body_cost_vec,
1397                            stmt_vector_for_cost *prologue_cost_vec)
1398 {
1399   stmt_vec_info stmt_info = dr_info->stmt;
1400   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1401   int ncopies;
1402
1403   if (PURE_SLP_STMT (stmt_info))
1404     ncopies = 1;
1405   else
1406     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1407
1408   if (DR_IS_READ (dr_info->dr))
1409     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1410                         misalignment, true, inside_cost,
1411                         outside_cost, prologue_cost_vec, body_cost_vec, false);
1412   else
1413     vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1414                          misalignment, inside_cost, body_cost_vec);
1415
1416   if (dump_enabled_p ())
1417     dump_printf_loc (MSG_NOTE, vect_location,
1418                      "vect_get_data_access_cost: inside_cost = %d, "
1419                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1420 }
1421
1422
1423 typedef struct _vect_peel_info
1424 {
1425   dr_vec_info *dr_info;
1426   int npeel;
1427   unsigned int count;
1428 } *vect_peel_info;
1429
1430 typedef struct _vect_peel_extended_info
1431 {
1432   vec_info *vinfo;
1433   struct _vect_peel_info peel_info;
1434   unsigned int inside_cost;
1435   unsigned int outside_cost;
1436 } *vect_peel_extended_info;
1437
1438
1439 /* Peeling hashtable helpers.  */
1440
1441 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1442 {
1443   static inline hashval_t hash (const _vect_peel_info *);
1444   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 };
1446
1447 inline hashval_t
1448 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1449 {
1450   return (hashval_t) peel_info->npeel;
1451 }
1452
1453 inline bool
1454 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1455 {
1456   return (a->npeel == b->npeel);
1457 }
1458
1459
1460 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1461
1462 static void
1463 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1464                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1465                           int npeel, bool supportable_if_not_aligned)
1466 {
1467   struct _vect_peel_info elem, *slot;
1468   _vect_peel_info **new_slot;
1469
1470   elem.npeel = npeel;
1471   slot = peeling_htab->find (&elem);
1472   if (slot)
1473     slot->count++;
1474   else
1475     {
1476       slot = XNEW (struct _vect_peel_info);
1477       slot->npeel = npeel;
1478       slot->dr_info = dr_info;
1479       slot->count = 1;
1480       new_slot = peeling_htab->find_slot (slot, INSERT);
1481       *new_slot = slot;
1482     }
1483
1484   /* If this DR is not supported with unknown misalignment then bias
1485      this slot when the cost model is disabled.  */
1486   if (!supportable_if_not_aligned
1487       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1488     slot->count += VECT_MAX_COST;
1489 }
1490
1491
1492 /* Traverse peeling hash table to find peeling option that aligns maximum
1493    number of data accesses.  */
1494
1495 int
1496 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1497                                      _vect_peel_extended_info *max)
1498 {
1499   vect_peel_info elem = *slot;
1500
1501   if (elem->count > max->peel_info.count
1502       || (elem->count == max->peel_info.count
1503           && max->peel_info.npeel > elem->npeel))
1504     {
1505       max->peel_info.npeel = elem->npeel;
1506       max->peel_info.count = elem->count;
1507       max->peel_info.dr_info = elem->dr_info;
1508     }
1509
1510   return 1;
1511 }
1512
1513 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1514    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1515    npeel is computed at runtime but DR0_INFO's misalignment will be zero
1516    after peeling.  */
1517
1518 static void
1519 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1520                                 dr_vec_info *dr0_info,
1521                                 unsigned int *inside_cost,
1522                                 unsigned int *outside_cost,
1523                                 stmt_vector_for_cost *body_cost_vec,
1524                                 stmt_vector_for_cost *prologue_cost_vec,
1525                                 unsigned int npeel)
1526 {
1527   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1528
1529   bool dr0_alignment_known_p
1530     = (dr0_info
1531        && known_alignment_for_access_p (dr0_info,
1532                                         STMT_VINFO_VECTYPE (dr0_info->stmt)));
1533
1534   for (data_reference *dr : datarefs)
1535     {
1536       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1537       if (!vect_relevant_for_alignment_p (dr_info))
1538         continue;
1539
1540       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1541       dr_alignment_support alignment_support_scheme;
1542       int misalignment;
1543       unsigned HOST_WIDE_INT alignment;
1544
1545       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1546                                             size_zero_node) < 0;
1547       poly_int64 off = 0;
1548       if (negative)
1549         off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1550                * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1551
1552       if (npeel == 0)
1553         misalignment = dr_misalignment (dr_info, vectype, off);
1554       else if (dr_info == dr0_info
1555                || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1556         misalignment = 0;
1557       else if (!dr0_alignment_known_p
1558                || !known_alignment_for_access_p (dr_info, vectype)
1559                || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1560         misalignment = DR_MISALIGNMENT_UNKNOWN;
1561       else
1562         {
1563           misalignment = dr_misalignment (dr_info, vectype, off);
1564           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1565           misalignment &= alignment - 1;
1566         }
1567       alignment_support_scheme
1568         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1569                                          misalignment);
1570
1571       vect_get_data_access_cost (loop_vinfo, dr_info,
1572                                  alignment_support_scheme, misalignment,
1573                                  inside_cost, outside_cost,
1574                                  body_cost_vec, prologue_cost_vec);
1575     }
1576 }
1577
1578 /* Traverse peeling hash table and calculate cost for each peeling option.
1579    Find the one with the lowest cost.  */
1580
1581 int
1582 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1583                                    _vect_peel_extended_info *min)
1584 {
1585   vect_peel_info elem = *slot;
1586   int dummy;
1587   unsigned int inside_cost = 0, outside_cost = 0;
1588   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1589   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1590                        epilogue_cost_vec;
1591
1592   prologue_cost_vec.create (2);
1593   body_cost_vec.create (2);
1594   epilogue_cost_vec.create (2);
1595
1596   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1597                                   &outside_cost, &body_cost_vec,
1598                                   &prologue_cost_vec, elem->npeel);
1599
1600   body_cost_vec.release ();
1601
1602   outside_cost += vect_get_known_peeling_cost
1603     (loop_vinfo, elem->npeel, &dummy,
1604      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1605      &prologue_cost_vec, &epilogue_cost_vec);
1606
1607   /* Prologue and epilogue costs are added to the target model later.
1608      These costs depend only on the scalar iteration cost, the
1609      number of peeling iterations finally chosen, and the number of
1610      misaligned statements.  So discard the information found here.  */
1611   prologue_cost_vec.release ();
1612   epilogue_cost_vec.release ();
1613
1614   if (inside_cost < min->inside_cost
1615       || (inside_cost == min->inside_cost
1616           && outside_cost < min->outside_cost))
1617     {
1618       min->inside_cost = inside_cost;
1619       min->outside_cost = outside_cost;
1620       min->peel_info.dr_info = elem->dr_info;
1621       min->peel_info.npeel = elem->npeel;
1622       min->peel_info.count = elem->count;
1623     }
1624
1625   return 1;
1626 }
1627
1628
1629 /* Choose best peeling option by traversing peeling hash table and either
1630    choosing an option with the lowest cost (if cost model is enabled) or the
1631    option that aligns as many accesses as possible.  */
1632
1633 static struct _vect_peel_extended_info
1634 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1635                                        loop_vec_info loop_vinfo)
1636 {
1637    struct _vect_peel_extended_info res;
1638
1639    res.peel_info.dr_info = NULL;
1640    res.vinfo = loop_vinfo;
1641
1642    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1643      {
1644        res.inside_cost = INT_MAX;
1645        res.outside_cost = INT_MAX;
1646        peeling_htab->traverse <_vect_peel_extended_info *,
1647                                vect_peeling_hash_get_lowest_cost> (&res);
1648      }
1649    else
1650      {
1651        res.peel_info.count = 0;
1652        peeling_htab->traverse <_vect_peel_extended_info *,
1653                                vect_peeling_hash_get_most_frequent> (&res);
1654        res.inside_cost = 0;
1655        res.outside_cost = 0;
1656      }
1657
1658    return res;
1659 }
1660
1661 /* Return true if the new peeling NPEEL is supported.  */
1662
1663 static bool
1664 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1665                           unsigned npeel)
1666 {
1667   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1668   enum dr_alignment_support supportable_dr_alignment;
1669
1670   bool dr0_alignment_known_p
1671     = known_alignment_for_access_p (dr0_info,
1672                                     STMT_VINFO_VECTYPE (dr0_info->stmt));
1673
1674   /* Ensure that all data refs can be vectorized after the peel.  */
1675   for (data_reference *dr : datarefs)
1676     {
1677       if (dr == dr0_info->dr)
1678         continue;
1679
1680       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1681       if (!vect_relevant_for_alignment_p (dr_info)
1682           || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1683         continue;
1684
1685       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1686       int misalignment;
1687       unsigned HOST_WIDE_INT alignment;
1688       if (!dr0_alignment_known_p
1689           || !known_alignment_for_access_p (dr_info, vectype)
1690           || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1691         misalignment = DR_MISALIGNMENT_UNKNOWN;
1692       else
1693         {
1694           misalignment = dr_misalignment (dr_info, vectype);
1695           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1696           misalignment &= alignment - 1;
1697         }
1698       supportable_dr_alignment
1699         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1700                                          misalignment);
1701       if (supportable_dr_alignment == dr_unaligned_unsupported)
1702         return false;
1703     }
1704
1705   return true;
1706 }
1707
1708 /* Compare two data-references DRA and DRB to group them into chunks
1709    with related alignment.  */
1710
1711 static int
1712 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1713 {
1714   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1715   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1716   int cmp;
1717
1718   /* Stabilize sort.  */
1719   if (dra == drb)
1720     return 0;
1721
1722   /* Ordering of DRs according to base.  */
1723   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1724                                DR_BASE_ADDRESS (drb));
1725   if (cmp != 0)
1726     return cmp;
1727
1728   /* And according to DR_OFFSET.  */
1729   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1730   if (cmp != 0)
1731     return cmp;
1732
1733   /* And after step.  */
1734   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1735   if (cmp != 0)
1736     return cmp;
1737
1738   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
1739   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1740   if (cmp == 0)
1741     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1742   return cmp;
1743 }
1744
1745 /* Function vect_enhance_data_refs_alignment
1746
1747    This pass will use loop versioning and loop peeling in order to enhance
1748    the alignment of data references in the loop.
1749
1750    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1751    original loop is to be vectorized.  Any other loops that are created by
1752    the transformations performed in this pass - are not supposed to be
1753    vectorized.  This restriction will be relaxed.
1754
1755    This pass will require a cost model to guide it whether to apply peeling
1756    or versioning or a combination of the two.  For example, the scheme that
1757    Intel uses when given a loop with several memory accesses, is as follows:
1758    choose one memory access ('p') which alignment you want to force by doing
1759    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1760    other accesses are not necessarily aligned, or (2) use loop versioning to
1761    generate one loop in which all accesses are aligned, and another loop in
1762    which only 'p' is necessarily aligned.
1763
1764    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1765    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1766    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1767
1768    Devising a cost model is the most critical aspect of this work.  It will
1769    guide us on which access to peel for, whether to use loop versioning, how
1770    many versions to create, etc.  The cost model will probably consist of
1771    generic considerations as well as target specific considerations (on
1772    powerpc for example, misaligned stores are more painful than misaligned
1773    loads).
1774
1775    Here are the general steps involved in alignment enhancements:
1776
1777      -- original loop, before alignment analysis:
1778         for (i=0; i<N; i++){
1779           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1780           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1781         }
1782
1783      -- After vect_compute_data_refs_alignment:
1784         for (i=0; i<N; i++){
1785           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1786           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1787         }
1788
1789      -- Possibility 1: we do loop versioning:
1790      if (p is aligned) {
1791         for (i=0; i<N; i++){    # loop 1A
1792           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1793           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1794         }
1795      }
1796      else {
1797         for (i=0; i<N; i++){    # loop 1B
1798           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1799           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1800         }
1801      }
1802
1803      -- Possibility 2: we do loop peeling:
1804      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1805         x = q[i];
1806         p[i] = y;
1807      }
1808      for (i = 3; i < N; i++){   # loop 2A
1809         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1810         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1811      }
1812
1813      -- Possibility 3: combination of loop peeling and versioning:
1814      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1815         x = q[i];
1816         p[i] = y;
1817      }
1818      if (p is aligned) {
1819         for (i = 3; i<N; i++){  # loop 3A
1820           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1821           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1822         }
1823      }
1824      else {
1825         for (i = 3; i<N; i++){  # loop 3B
1826           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1827           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1828         }
1829      }
1830
1831      These loops are later passed to loop_transform to be vectorized.  The
1832      vectorizer will use the alignment information to guide the transformation
1833      (whether to generate regular loads/stores, or with special handling for
1834      misalignment).  */
1835
1836 opt_result
1837 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1838 {
1839   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1840   dr_vec_info *first_store = NULL;
1841   dr_vec_info *dr0_info = NULL;
1842   struct data_reference *dr;
1843   unsigned int i;
1844   bool do_peeling = false;
1845   bool do_versioning = false;
1846   unsigned int npeel = 0;
1847   bool one_misalignment_known = false;
1848   bool one_misalignment_unknown = false;
1849   bool one_dr_unsupportable = false;
1850   dr_vec_info *unsupportable_dr_info = NULL;
1851   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1852   hash_table<peel_info_hasher> peeling_htab (1);
1853
1854   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1855
1856   /* Reset data so we can safely be called multiple times.  */
1857   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1858   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1859
1860   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1861     return opt_result::success ();
1862
1863   /* Sort the vector of datarefs so DRs that have the same or dependent
1864      alignment are next to each other.  */
1865   auto_vec<data_reference_p> datarefs
1866     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1867   datarefs.qsort (dr_align_group_sort_cmp);
1868
1869   /* Compute the number of DRs that become aligned when we peel
1870      a dataref so it becomes aligned.  */
1871   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1872   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1873   unsigned i0;
1874   for (i0 = 0; i0 < datarefs.length (); ++i0)
1875     if (DR_BASE_ADDRESS (datarefs[i0]))
1876       break;
1877   for (i = i0 + 1; i <= datarefs.length (); ++i)
1878     {
1879       if (i == datarefs.length ()
1880           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1881                                DR_BASE_ADDRESS (datarefs[i]), 0)
1882           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1883                                DR_OFFSET (datarefs[i]), 0)
1884           || !operand_equal_p (DR_STEP (datarefs[i0]),
1885                                DR_STEP (datarefs[i]), 0))
1886         {
1887           /* The subgroup [i0, i-1] now only differs in DR_INIT and
1888              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1889              will get known misalignment if we align one of the refs
1890              with the largest DR_TARGET_ALIGNMENT.  */
1891           for (unsigned j = i0; j < i; ++j)
1892             {
1893               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1894               for (unsigned k = i0; k < i; ++k)
1895                 {
1896                   if (k == j)
1897                     continue;
1898                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1899                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1900                                                                dr_infoj))
1901                     n_same_align_refs[j]++;
1902                 }
1903             }
1904           i0 = i;
1905         }
1906     }
1907
1908   /* While cost model enhancements are expected in the future, the high level
1909      view of the code at this time is as follows:
1910
1911      A) If there is a misaligned access then see if peeling to align
1912         this access can make all data references satisfy
1913         vect_supportable_dr_alignment.  If so, update data structures
1914         as needed and return true.
1915
1916      B) If peeling wasn't possible and there is a data reference with an
1917         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1918         then see if loop versioning checks can be used to make all data
1919         references satisfy vect_supportable_dr_alignment.  If so, update
1920         data structures as needed and return true.
1921
1922      C) If neither peeling nor versioning were successful then return false if
1923         any data reference does not satisfy vect_supportable_dr_alignment.
1924
1925      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1926
1927      Note, Possibility 3 above (which is peeling and versioning together) is not
1928      being done at this time.  */
1929
1930   /* (1) Peeling to force alignment.  */
1931
1932   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1933      Considerations:
1934      + How many accesses will become aligned due to the peeling
1935      - How many accesses will become unaligned due to the peeling,
1936        and the cost of misaligned accesses.
1937      - The cost of peeling (the extra runtime checks, the increase
1938        in code size).  */
1939
1940   FOR_EACH_VEC_ELT (datarefs, i, dr)
1941     {
1942       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1943       if (!vect_relevant_for_alignment_p (dr_info))
1944         continue;
1945
1946       stmt_vec_info stmt_info = dr_info->stmt;
1947       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1948       do_peeling = vector_alignment_reachable_p (dr_info);
1949       if (do_peeling)
1950         {
1951           if (known_alignment_for_access_p (dr_info, vectype))
1952             {
1953               unsigned int npeel_tmp = 0;
1954               bool negative = tree_int_cst_compare (DR_STEP (dr),
1955                                                     size_zero_node) < 0;
1956
1957               /* If known_alignment_for_access_p then we have set
1958                  DR_MISALIGNMENT which is only done if we know it at compiler
1959                  time, so it is safe to assume target alignment is constant.
1960                */
1961               unsigned int target_align =
1962                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1963               unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1964               poly_int64 off = 0;
1965               if (negative)
1966                 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1967               unsigned int mis = dr_misalignment (dr_info, vectype, off);
1968               mis = negative ? mis : -mis;
1969               if (mis != 0)
1970                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1971
1972               /* For multiple types, it is possible that the bigger type access
1973                  will have more than one peeling option.  E.g., a loop with two
1974                  types: one of size (vector size / 4), and the other one of
1975                  size (vector size / 8).  Vectorization factor will 8.  If both
1976                  accesses are misaligned by 3, the first one needs one scalar
1977                  iteration to be aligned, and the second one needs 5.  But the
1978                  first one will be aligned also by peeling 5 scalar
1979                  iterations, and in that case both accesses will be aligned.
1980                  Hence, except for the immediate peeling amount, we also want
1981                  to try to add full vector size, while we don't exceed
1982                  vectorization factor.
1983                  We do this automatically for cost model, since we calculate
1984                  cost for every peeling option.  */
1985               poly_uint64 nscalars = npeel_tmp;
1986               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1987                 {
1988                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989                   nscalars = (STMT_SLP_TYPE (stmt_info)
1990                               ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991                 }
1992
1993               /* Save info about DR in the hash table.  Also include peeling
1994                  amounts according to the explanation above.  Indicate
1995                  the alignment status when the ref is not aligned.
1996                  ???  Rather than using unknown alignment here we should
1997                  prune all entries from the peeling hashtable which cause
1998                  DRs to be not supported.  */
1999               bool supportable_if_not_aligned
2000                 = vect_supportable_dr_alignment
2001                     (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2002               while (known_le (npeel_tmp, nscalars))
2003                 {
2004                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2005                                             dr_info, npeel_tmp,
2006                                             supportable_if_not_aligned);
2007                   npeel_tmp += MAX (1, target_align / dr_size);
2008                 }
2009
2010               one_misalignment_known = true;
2011             }
2012           else
2013             {
2014               /* If we don't know any misalignment values, we prefer
2015                  peeling for data-ref that has the maximum number of data-refs
2016                  with the same alignment, unless the target prefers to align
2017                  stores over load.  */
2018               unsigned same_align_drs = n_same_align_refs[i];
2019               if (!dr0_info
2020                   || dr0_same_align_drs < same_align_drs)
2021                 {
2022                   dr0_same_align_drs = same_align_drs;
2023                   dr0_info = dr_info;
2024                 }
2025               /* For data-refs with the same number of related
2026                  accesses prefer the one where the misalign
2027                  computation will be invariant in the outermost loop.  */
2028               else if (dr0_same_align_drs == same_align_drs)
2029                 {
2030                   class loop *ivloop0, *ivloop;
2031                   ivloop0 = outermost_invariant_loop_for_expr
2032                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
2033                   ivloop = outermost_invariant_loop_for_expr
2034                     (loop, DR_BASE_ADDRESS (dr));
2035                   if ((ivloop && !ivloop0)
2036                       || (ivloop && ivloop0
2037                           && flow_loop_nested_p (ivloop, ivloop0)))
2038                     dr0_info = dr_info;
2039                 }
2040
2041               one_misalignment_unknown = true;
2042
2043               /* Check for data refs with unsupportable alignment that
2044                  can be peeled.  */
2045               enum dr_alignment_support supportable_dr_alignment
2046                 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2047                                                  DR_MISALIGNMENT_UNKNOWN);
2048               if (supportable_dr_alignment == dr_unaligned_unsupported)
2049                 {
2050                   one_dr_unsupportable = true;
2051                   unsupportable_dr_info = dr_info;
2052                 }
2053
2054               if (!first_store && DR_IS_WRITE (dr))
2055                 {
2056                   first_store = dr_info;
2057                   first_store_same_align_drs = same_align_drs;
2058                 }
2059             }
2060         }
2061       else
2062         {
2063           if (!aligned_access_p (dr_info, vectype))
2064             {
2065               if (dump_enabled_p ())
2066                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067                                  "vector alignment may not be reachable\n");
2068               break;
2069             }
2070         }
2071     }
2072
2073   /* Check if we can possibly peel the loop.  */
2074   if (!vect_can_advance_ivs_p (loop_vinfo)
2075       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2076       || loop->inner)
2077     do_peeling = false;
2078
2079   struct _vect_peel_extended_info peel_for_known_alignment;
2080   struct _vect_peel_extended_info peel_for_unknown_alignment;
2081   struct _vect_peel_extended_info best_peel;
2082
2083   peel_for_unknown_alignment.inside_cost = INT_MAX;
2084   peel_for_unknown_alignment.outside_cost = INT_MAX;
2085   peel_for_unknown_alignment.peel_info.count = 0;
2086
2087   if (do_peeling
2088       && one_misalignment_unknown)
2089     {
2090       /* Check if the target requires to prefer stores over loads, i.e., if
2091          misaligned stores are more expensive than misaligned loads (taking
2092          drs with same alignment into account).  */
2093       unsigned int load_inside_cost = 0;
2094       unsigned int load_outside_cost = 0;
2095       unsigned int store_inside_cost = 0;
2096       unsigned int store_outside_cost = 0;
2097       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2098
2099       stmt_vector_for_cost dummy;
2100       dummy.create (2);
2101       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2102                                       &load_inside_cost,
2103                                       &load_outside_cost,
2104                                       &dummy, &dummy, estimated_npeels);
2105       dummy.release ();
2106
2107       if (first_store)
2108         {
2109           dummy.create (2);
2110           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2111                                           &store_inside_cost,
2112                                           &store_outside_cost,
2113                                           &dummy, &dummy,
2114                                           estimated_npeels);
2115           dummy.release ();
2116         }
2117       else
2118         {
2119           store_inside_cost = INT_MAX;
2120           store_outside_cost = INT_MAX;
2121         }
2122
2123       if (load_inside_cost > store_inside_cost
2124           || (load_inside_cost == store_inside_cost
2125               && load_outside_cost > store_outside_cost))
2126         {
2127           dr0_info = first_store;
2128           dr0_same_align_drs = first_store_same_align_drs;
2129           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2130           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2131         }
2132       else
2133         {
2134           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2135           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136         }
2137
2138       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2139       prologue_cost_vec.create (2);
2140       epilogue_cost_vec.create (2);
2141
2142       int dummy2;
2143       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2144         (loop_vinfo, estimated_npeels, &dummy2,
2145          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2146          &prologue_cost_vec, &epilogue_cost_vec);
2147
2148       prologue_cost_vec.release ();
2149       epilogue_cost_vec.release ();
2150
2151       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152     }
2153
2154   peel_for_unknown_alignment.peel_info.npeel = 0;
2155   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2156
2157   best_peel = peel_for_unknown_alignment;
2158
2159   peel_for_known_alignment.inside_cost = INT_MAX;
2160   peel_for_known_alignment.outside_cost = INT_MAX;
2161   peel_for_known_alignment.peel_info.count = 0;
2162   peel_for_known_alignment.peel_info.dr_info = NULL;
2163
2164   if (do_peeling && one_misalignment_known)
2165     {
2166       /* Peeling is possible, but there is no data access that is not supported
2167          unless aligned.  So we try to choose the best possible peeling from
2168          the hash table.  */
2169       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2170         (&peeling_htab, loop_vinfo);
2171     }
2172
2173   /* Compare costs of peeling for known and unknown alignment. */
2174   if (peel_for_known_alignment.peel_info.dr_info != NULL
2175       && peel_for_unknown_alignment.inside_cost
2176       >= peel_for_known_alignment.inside_cost)
2177     {
2178       best_peel = peel_for_known_alignment;
2179
2180       /* If the best peeling for known alignment has NPEEL == 0, perform no
2181          peeling at all except if there is an unsupportable dr that we can
2182          align.  */
2183       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2184         do_peeling = false;
2185     }
2186
2187   /* If there is an unsupportable data ref, prefer this over all choices so far
2188      since we'd have to discard a chosen peeling except when it accidentally
2189      aligned the unsupportable data ref.  */
2190   if (one_dr_unsupportable)
2191     dr0_info = unsupportable_dr_info;
2192   else if (do_peeling)
2193     {
2194       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2195          TODO: Use nopeel_outside_cost or get rid of it?  */
2196       unsigned nopeel_inside_cost = 0;
2197       unsigned nopeel_outside_cost = 0;
2198
2199       stmt_vector_for_cost dummy;
2200       dummy.create (2);
2201       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2202                                       &nopeel_outside_cost, &dummy, &dummy, 0);
2203       dummy.release ();
2204
2205       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2206          costs will be recorded.  */
2207       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2208       prologue_cost_vec.create (2);
2209       epilogue_cost_vec.create (2);
2210
2211       int dummy2;
2212       nopeel_outside_cost += vect_get_known_peeling_cost
2213         (loop_vinfo, 0, &dummy2,
2214          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2215          &prologue_cost_vec, &epilogue_cost_vec);
2216
2217       prologue_cost_vec.release ();
2218       epilogue_cost_vec.release ();
2219
2220       npeel = best_peel.peel_info.npeel;
2221       dr0_info = best_peel.peel_info.dr_info;
2222
2223       /* If no peeling is not more expensive than the best peeling we
2224          have so far, don't perform any peeling.  */
2225       if (nopeel_inside_cost <= best_peel.inside_cost)
2226         do_peeling = false;
2227     }
2228
2229   if (do_peeling)
2230     {
2231       stmt_vec_info stmt_info = dr0_info->stmt;
2232       if (known_alignment_for_access_p (dr0_info,
2233                                         STMT_VINFO_VECTYPE (stmt_info)))
2234         {
2235           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2236                                                 size_zero_node) < 0;
2237           if (!npeel)
2238             {
2239               /* Since it's known at compile time, compute the number of
2240                  iterations in the peeled loop (the peeling factor) for use in
2241                  updating DR_MISALIGNMENT values.  The peeling factor is the
2242                  vectorization factor minus the misalignment as an element
2243                  count.  */
2244               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2245               poly_int64 off = 0;
2246               if (negative)
2247                 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2248                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2249               unsigned int mis
2250                 = dr_misalignment (dr0_info, vectype, off);
2251               mis = negative ? mis : -mis;
2252               /* If known_alignment_for_access_p then we have set
2253                  DR_MISALIGNMENT which is only done if we know it at compiler
2254                  time, so it is safe to assume target alignment is constant.
2255                */
2256               unsigned int target_align =
2257                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2258               npeel = ((mis & (target_align - 1))
2259                        / vect_get_scalar_dr_size (dr0_info));
2260             }
2261
2262           /* For interleaved data access every iteration accesses all the
2263              members of the group, therefore we divide the number of iterations
2264              by the group size.  */
2265           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2266             npeel /= DR_GROUP_SIZE (stmt_info);
2267
2268           if (dump_enabled_p ())
2269             dump_printf_loc (MSG_NOTE, vect_location,
2270                              "Try peeling by %d\n", npeel);
2271         }
2272
2273       /* Ensure that all datarefs can be vectorized after the peel.  */
2274       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2275         do_peeling = false;
2276
2277       /* Check if all datarefs are supportable and log.  */
2278       if (do_peeling
2279           && npeel == 0
2280           && known_alignment_for_access_p (dr0_info,
2281                                            STMT_VINFO_VECTYPE (stmt_info)))
2282         return opt_result::success ();
2283
2284       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2285       if (do_peeling)
2286         {
2287           unsigned max_allowed_peel
2288             = param_vect_max_peeling_for_alignment;
2289           if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2290             max_allowed_peel = 0;
2291           if (max_allowed_peel != (unsigned)-1)
2292             {
2293               unsigned max_peel = npeel;
2294               if (max_peel == 0)
2295                 {
2296                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2297                   unsigned HOST_WIDE_INT target_align_c;
2298                   if (target_align.is_constant (&target_align_c))
2299                     max_peel =
2300                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2301                   else
2302                     {
2303                       do_peeling = false;
2304                       if (dump_enabled_p ())
2305                         dump_printf_loc (MSG_NOTE, vect_location,
2306                           "Disable peeling, max peels set and vector"
2307                           " alignment unknown\n");
2308                     }
2309                 }
2310               if (max_peel > max_allowed_peel)
2311                 {
2312                   do_peeling = false;
2313                   if (dump_enabled_p ())
2314                     dump_printf_loc (MSG_NOTE, vect_location,
2315                         "Disable peeling, max peels reached: %d\n", max_peel);
2316                 }
2317             }
2318         }
2319
2320       /* Cost model #2 - if peeling may result in a remaining loop not
2321          iterating enough to be vectorized then do not peel.  Since this
2322          is a cost heuristic rather than a correctness decision, use the
2323          most likely runtime value for variable vectorization factors.  */
2324       if (do_peeling
2325           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2326         {
2327           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2328           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2329           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2330               < assumed_vf + max_peel)
2331             do_peeling = false;
2332         }
2333
2334       if (do_peeling)
2335         {
2336           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2337              If the misalignment of DR_i is identical to that of dr0 then set
2338              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2339              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2340              by the peeling factor times the element size of DR_i (MOD the
2341              vectorization factor times the size).  Otherwise, the
2342              misalignment of DR_i must be set to unknown.  */
2343           FOR_EACH_VEC_ELT (datarefs, i, dr)
2344             if (dr != dr0_info->dr)
2345               {
2346                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2347                 if (!vect_relevant_for_alignment_p (dr_info))
2348                   continue;
2349
2350                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351               }
2352
2353           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2354           if (npeel)
2355             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2356           else
2357             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2358           SET_DR_MISALIGNMENT (dr0_info,
2359                                vect_dr_misalign_for_aligned_access (dr0_info));
2360           if (dump_enabled_p ())
2361             {
2362               dump_printf_loc (MSG_NOTE, vect_location,
2363                                "Alignment of access forced using peeling.\n");
2364               dump_printf_loc (MSG_NOTE, vect_location,
2365                                "Peeling for alignment will be applied.\n");
2366             }
2367
2368           /* The inside-loop cost will be accounted for in vectorizable_load
2369              and vectorizable_store correctly with adjusted alignments.
2370              Drop the body_cst_vec on the floor here.  */
2371           return opt_result::success ();
2372         }
2373     }
2374
2375   /* (2) Versioning to force alignment.  */
2376
2377   /* Try versioning if:
2378      1) optimize loop for speed and the cost-model is not cheap
2379      2) there is at least one unsupported misaligned data ref with an unknown
2380         misalignment, and
2381      3) all misaligned data refs with a known misalignment are supported, and
2382      4) the number of runtime alignment checks is within reason.  */
2383
2384   do_versioning
2385     = (optimize_loop_nest_for_speed_p (loop)
2386        && !loop->inner /* FORNOW */
2387        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2388
2389   if (do_versioning)
2390     {
2391       FOR_EACH_VEC_ELT (datarefs, i, dr)
2392         {
2393           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2394           if (!vect_relevant_for_alignment_p (dr_info))
2395             continue;
2396
2397           stmt_vec_info stmt_info = dr_info->stmt;
2398           if (STMT_VINFO_STRIDED_P (stmt_info))
2399             {
2400               do_versioning = false;
2401               break;
2402             }
2403
2404           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2405           bool negative = tree_int_cst_compare (DR_STEP (dr),
2406                                                 size_zero_node) < 0;
2407           poly_int64 off = 0;
2408           if (negative)
2409             off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2410                    * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2411           int misalignment;
2412           if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2413             continue;
2414
2415           enum dr_alignment_support supportable_dr_alignment
2416             = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2417                                              misalignment);
2418           if (supportable_dr_alignment == dr_unaligned_unsupported)
2419             {
2420               if (misalignment != DR_MISALIGNMENT_UNKNOWN
2421                   || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2422                       >= (unsigned) param_vect_max_version_for_alignment_checks))
2423                 {
2424                   do_versioning = false;
2425                   break;
2426                 }
2427
2428               /* At present we don't support versioning for alignment
2429                  with variable VF, since there's no guarantee that the
2430                  VF is a power of two.  We could relax this if we added
2431                  a way of enforcing a power-of-two size.  */
2432               unsigned HOST_WIDE_INT size;
2433               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2434                 {
2435                   do_versioning = false;
2436                   break;
2437                 }
2438
2439               /* Forcing alignment in the first iteration is no good if
2440                  we don't keep it across iterations.  For now, just disable
2441                  versioning in this case.
2442                  ?? We could actually unroll the loop to achieve the required
2443                  overall step alignment, and forcing the alignment could be
2444                  done by doing some iterations of the non-vectorized loop.  */
2445               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2446                                * DR_STEP_ALIGNMENT (dr),
2447                                DR_TARGET_ALIGNMENT (dr_info)))
2448                 {
2449                   do_versioning = false;
2450                   break;
2451                 }
2452
2453               /* The rightmost bits of an aligned address must be zeros.
2454                  Construct the mask needed for this test.  For example,
2455                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2456                  mask must be 15 = 0xf. */
2457               int mask = size - 1;
2458
2459               /* FORNOW: use the same mask to test all potentially unaligned
2460                  references in the loop.  */
2461               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2462                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2463                 {
2464                   do_versioning = false;
2465                   break;
2466                 }
2467
2468               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2469               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2470             }
2471         }
2472
2473       /* Versioning requires at least one misaligned data reference.  */
2474       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2475         do_versioning = false;
2476       else if (!do_versioning)
2477         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478     }
2479
2480   if (do_versioning)
2481     {
2482       const vec<stmt_vec_info> &may_misalign_stmts
2483         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2484       stmt_vec_info stmt_info;
2485
2486       /* It can now be assumed that the data references in the statements
2487          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2488          of the loop being vectorized.  */
2489       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2490         {
2491           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2492           SET_DR_MISALIGNMENT (dr_info,
2493                                vect_dr_misalign_for_aligned_access (dr_info));
2494           if (dump_enabled_p ())
2495             dump_printf_loc (MSG_NOTE, vect_location,
2496                              "Alignment of access forced using versioning.\n");
2497         }
2498
2499       if (dump_enabled_p ())
2500         dump_printf_loc (MSG_NOTE, vect_location,
2501                          "Versioning for alignment will be applied.\n");
2502
2503       /* Peeling and versioning can't be done together at this time.  */
2504       gcc_assert (! (do_peeling && do_versioning));
2505
2506       return opt_result::success ();
2507     }
2508
2509   /* This point is reached if neither peeling nor versioning is being done.  */
2510   gcc_assert (! (do_peeling || do_versioning));
2511
2512   return opt_result::success ();
2513 }
2514
2515
2516 /* Function vect_analyze_data_refs_alignment
2517
2518    Analyze the alignment of the data-references in the loop.
2519    Return FALSE if a data reference is found that cannot be vectorized.  */
2520
2521 opt_result
2522 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2523 {
2524   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2525
2526   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2527   struct data_reference *dr;
2528   unsigned int i;
2529
2530   vect_record_base_alignments (loop_vinfo);
2531   FOR_EACH_VEC_ELT (datarefs, i, dr)
2532     {
2533       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2534       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2535         {
2536           if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2537               && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2538             continue;
2539           vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2540                                            STMT_VINFO_VECTYPE (dr_info->stmt));
2541         }
2542     }
2543
2544   return opt_result::success ();
2545 }
2546
2547
2548 /* Analyze alignment of DRs of stmts in NODE.  */
2549
2550 static bool
2551 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2552 {
2553   /* Alignment is maintained in the first element of the group.  */
2554   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2555   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2556   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2557   tree vectype = SLP_TREE_VECTYPE (node);
2558   poly_uint64 vector_alignment
2559     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2560                  BITS_PER_UNIT);
2561   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2562     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2563   /* Re-analyze alignment when we're facing a vectorization with a bigger
2564      alignment requirement.  */
2565   else if (known_lt (dr_info->target_alignment, vector_alignment))
2566     {
2567       poly_uint64 old_target_alignment = dr_info->target_alignment;
2568       int old_misalignment = dr_info->misalignment;
2569       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2570       /* But keep knowledge about a smaller alignment.  */
2571       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2572           && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2573         {
2574           dr_info->target_alignment = old_target_alignment;
2575           dr_info->misalignment = old_misalignment;
2576         }
2577     }
2578   /* When we ever face unordered target alignments the first one wins in terms
2579      of analyzing and the other will become unknown in dr_misalignment.  */
2580   return true;
2581 }
2582
2583 /* Function vect_slp_analyze_instance_alignment
2584
2585    Analyze the alignment of the data-references in the SLP instance.
2586    Return FALSE if a data reference is found that cannot be vectorized.  */
2587
2588 bool
2589 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2590                                                 slp_instance instance)
2591 {
2592   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2593
2594   slp_tree node;
2595   unsigned i;
2596   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2597     if (! vect_slp_analyze_node_alignment (vinfo, node))
2598       return false;
2599
2600   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2601       && ! vect_slp_analyze_node_alignment
2602              (vinfo, SLP_INSTANCE_TREE (instance)))
2603     return false;
2604
2605   return true;
2606 }
2607
2608
2609 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2610    accesses of legal size, step, etc.  Detect gaps, single element
2611    interleaving, and other special cases. Set grouped access info.
2612    Collect groups of strided stores for further use in SLP analysis.
2613    Worker for vect_analyze_group_access.  */
2614
2615 static bool
2616 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2617 {
2618   data_reference *dr = dr_info->dr;
2619   tree step = DR_STEP (dr);
2620   tree scalar_type = TREE_TYPE (DR_REF (dr));
2621   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2622   stmt_vec_info stmt_info = dr_info->stmt;
2623   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2624   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2625   HOST_WIDE_INT dr_step = -1;
2626   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2627   bool slp_impossible = false;
2628
2629   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2630      size of the interleaving group (including gaps).  */
2631   if (tree_fits_shwi_p (step))
2632     {
2633       dr_step = tree_to_shwi (step);
2634       /* Check that STEP is a multiple of type size.  Otherwise there is
2635          a non-element-sized gap at the end of the group which we
2636          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2637          ???  As we can handle non-constant step fine here we should
2638          simply remove uses of DR_GROUP_GAP between the last and first
2639          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2640          simply not include that gap.  */
2641       if ((dr_step % type_size) != 0)
2642         {
2643           if (dump_enabled_p ())
2644             dump_printf_loc (MSG_NOTE, vect_location,
2645                              "Step %T is not a multiple of the element size"
2646                              " for %T\n",
2647                              step, DR_REF (dr));
2648           return false;
2649         }
2650       groupsize = absu_hwi (dr_step) / type_size;
2651     }
2652   else
2653     groupsize = 0;
2654
2655   /* Not consecutive access is possible only if it is a part of interleaving.  */
2656   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2657     {
2658       /* Check if it this DR is a part of interleaving, and is a single
2659          element of the group that is accessed in the loop.  */
2660
2661       /* Gaps are supported only for loads. STEP must be a multiple of the type
2662          size.  */
2663       if (DR_IS_READ (dr)
2664           && (dr_step % type_size) == 0
2665           && groupsize > 0
2666           /* This could be UINT_MAX but as we are generating code in a very
2667              inefficient way we have to cap earlier.
2668              See PR91403 for example.  */
2669           && groupsize <= 4096)
2670         {
2671           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2672           DR_GROUP_SIZE (stmt_info) = groupsize;
2673           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2674           if (dump_enabled_p ())
2675             dump_printf_loc (MSG_NOTE, vect_location,
2676                              "Detected single element interleaving %T"
2677                              " step %T\n",
2678                              DR_REF (dr), step);
2679
2680           return true;
2681         }
2682
2683       if (dump_enabled_p ())
2684         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2685                          "not consecutive access %G", stmt_info->stmt);
2686
2687       if (bb_vinfo)
2688         {
2689           /* Mark the statement as unvectorizable.  */
2690           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2691           return true;
2692         }
2693
2694       if (dump_enabled_p ())
2695         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2696       STMT_VINFO_STRIDED_P (stmt_info) = true;
2697       return true;
2698     }
2699
2700   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2701     {
2702       /* First stmt in the interleaving chain. Check the chain.  */
2703       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2704       struct data_reference *data_ref = dr;
2705       unsigned int count = 1;
2706       tree prev_init = DR_INIT (data_ref);
2707       HOST_WIDE_INT diff, gaps = 0;
2708
2709       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2710       while (next)
2711         {
2712           /* We never have the same DR multiple times.  */
2713           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2714                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2715
2716           data_ref = STMT_VINFO_DATA_REF (next);
2717
2718           /* All group members have the same STEP by construction.  */
2719           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2720
2721           /* Check that the distance between two accesses is equal to the type
2722              size. Otherwise, we have gaps.  */
2723           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2724                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2725           if (diff < 1 || diff > UINT_MAX)
2726             {
2727               /* For artificial testcases with array accesses with large
2728                  constant indices we can run into overflow issues which
2729                  can end up fooling the groupsize constraint below so
2730                  check the individual gaps (which are represented as
2731                  unsigned int) as well.  */
2732               if (dump_enabled_p ())
2733                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2734                                  "interleaved access with gap larger "
2735                                  "than representable\n");
2736               return false;
2737             }
2738           if (diff != 1)
2739             {
2740               /* FORNOW: SLP of accesses with gaps is not supported.  */
2741               slp_impossible = true;
2742               if (DR_IS_WRITE (data_ref))
2743                 {
2744                   if (dump_enabled_p ())
2745                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2746                                      "interleaved store with gaps\n");
2747                   return false;
2748                 }
2749
2750               gaps += diff - 1;
2751             }
2752
2753           last_accessed_element += diff;
2754
2755           /* Store the gap from the previous member of the group. If there is no
2756              gap in the access, DR_GROUP_GAP is always 1.  */
2757           DR_GROUP_GAP (next) = diff;
2758
2759           prev_init = DR_INIT (data_ref);
2760           next = DR_GROUP_NEXT_ELEMENT (next);
2761           /* Count the number of data-refs in the chain.  */
2762           count++;
2763         }
2764
2765       if (groupsize == 0)
2766         groupsize = count + gaps;
2767
2768       /* This could be UINT_MAX but as we are generating code in a very
2769          inefficient way we have to cap earlier.  See PR78699 for example.  */
2770       if (groupsize > 4096)
2771         {
2772           if (dump_enabled_p ())
2773             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2774                              "group is too large\n");
2775           return false;
2776         }
2777
2778       /* Check that the size of the interleaving is equal to count for stores,
2779          i.e., that there are no gaps.  */
2780       if (groupsize != count
2781           && !DR_IS_READ (dr))
2782         {
2783           groupsize = count;
2784           STMT_VINFO_STRIDED_P (stmt_info) = true;
2785         }
2786
2787       /* If there is a gap after the last load in the group it is the
2788          difference between the groupsize and the last accessed
2789          element.
2790          When there is no gap, this difference should be 0.  */
2791       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2792
2793       DR_GROUP_SIZE (stmt_info) = groupsize;
2794       if (dump_enabled_p ())
2795         {
2796           dump_printf_loc (MSG_NOTE, vect_location,
2797                            "Detected interleaving ");
2798           if (DR_IS_READ (dr))
2799             dump_printf (MSG_NOTE, "load ");
2800           else if (STMT_VINFO_STRIDED_P (stmt_info))
2801             dump_printf (MSG_NOTE, "strided store ");
2802           else
2803             dump_printf (MSG_NOTE, "store ");
2804           dump_printf (MSG_NOTE, "of size %u\n",
2805                        (unsigned)groupsize);
2806           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2807           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2808           while (next)
2809             {
2810               if (DR_GROUP_GAP (next) != 1)
2811                 dump_printf_loc (MSG_NOTE, vect_location,
2812                                  "\t<gap of %d elements>\n",
2813                                  DR_GROUP_GAP (next) - 1);
2814               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2815               next = DR_GROUP_NEXT_ELEMENT (next);
2816             }
2817           if (DR_GROUP_GAP (stmt_info) != 0)
2818             dump_printf_loc (MSG_NOTE, vect_location,
2819                              "\t<gap of %d elements>\n",
2820                              DR_GROUP_GAP (stmt_info));
2821         }
2822
2823       /* SLP: create an SLP data structure for every interleaving group of
2824          stores for further analysis in vect_analyse_slp.  */
2825       if (DR_IS_WRITE (dr) && !slp_impossible)
2826         {
2827           if (loop_vinfo)
2828             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2829           if (bb_vinfo)
2830             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2831         }
2832     }
2833
2834   return true;
2835 }
2836
2837 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2838    accesses of legal size, step, etc.  Detect gaps, single element
2839    interleaving, and other special cases. Set grouped access info.
2840    Collect groups of strided stores for further use in SLP analysis.  */
2841
2842 static bool
2843 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2844 {
2845   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2846     {
2847       /* Dissolve the group if present.  */
2848       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2849       while (stmt_info)
2850         {
2851           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2852           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2853           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2854           stmt_info = next;
2855         }
2856       return false;
2857     }
2858   return true;
2859 }
2860
2861 /* Analyze the access pattern of the data-reference DR_INFO.
2862    In case of non-consecutive accesses call vect_analyze_group_access() to
2863    analyze groups of accesses.  */
2864
2865 static bool
2866 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2867 {
2868   data_reference *dr = dr_info->dr;
2869   tree step = DR_STEP (dr);
2870   tree scalar_type = TREE_TYPE (DR_REF (dr));
2871   stmt_vec_info stmt_info = dr_info->stmt;
2872   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2873   class loop *loop = NULL;
2874
2875   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2876     return true;
2877
2878   if (loop_vinfo)
2879     loop = LOOP_VINFO_LOOP (loop_vinfo);
2880
2881   if (loop_vinfo && !step)
2882     {
2883       if (dump_enabled_p ())
2884         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885                          "bad data-ref access in loop\n");
2886       return false;
2887     }
2888
2889   /* Allow loads with zero step in inner-loop vectorization.  */
2890   if (loop_vinfo && integer_zerop (step))
2891     {
2892       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2893       if (!nested_in_vect_loop_p (loop, stmt_info))
2894         return DR_IS_READ (dr);
2895       /* Allow references with zero step for outer loops marked
2896          with pragma omp simd only - it guarantees absence of
2897          loop-carried dependencies between inner loop iterations.  */
2898       if (loop->safelen < 2)
2899         {
2900           if (dump_enabled_p ())
2901             dump_printf_loc (MSG_NOTE, vect_location,
2902                              "zero step in inner loop of nest\n");
2903           return false;
2904         }
2905     }
2906
2907   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2908     {
2909       /* Interleaved accesses are not yet supported within outer-loop
2910         vectorization for references in the inner-loop.  */
2911       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2912
2913       /* For the rest of the analysis we use the outer-loop step.  */
2914       step = STMT_VINFO_DR_STEP (stmt_info);
2915       if (integer_zerop (step))
2916         {
2917           if (dump_enabled_p ())
2918             dump_printf_loc (MSG_NOTE, vect_location,
2919                              "zero step in outer loop.\n");
2920           return DR_IS_READ (dr);
2921         }
2922     }
2923
2924   /* Consecutive?  */
2925   if (TREE_CODE (step) == INTEGER_CST)
2926     {
2927       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2928       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2929           || (dr_step < 0
2930               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2931         {
2932           /* Mark that it is not interleaving.  */
2933           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2934           return true;
2935         }
2936     }
2937
2938   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2939     {
2940       if (dump_enabled_p ())
2941         dump_printf_loc (MSG_NOTE, vect_location,
2942                          "grouped access in outer loop.\n");
2943       return false;
2944     }
2945
2946
2947   /* Assume this is a DR handled by non-constant strided load case.  */
2948   if (TREE_CODE (step) != INTEGER_CST)
2949     return (STMT_VINFO_STRIDED_P (stmt_info)
2950             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2951                 || vect_analyze_group_access (vinfo, dr_info)));
2952
2953   /* Not consecutive access - check if it's a part of interleaving group.  */
2954   return vect_analyze_group_access (vinfo, dr_info);
2955 }
2956
2957 /* Compare two data-references DRA and DRB to group them into chunks
2958    suitable for grouping.  */
2959
2960 static int
2961 dr_group_sort_cmp (const void *dra_, const void *drb_)
2962 {
2963   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2964   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2965   data_reference_p dra = dra_info->dr;
2966   data_reference_p drb = drb_info->dr;
2967   int cmp;
2968
2969   /* Stabilize sort.  */
2970   if (dra == drb)
2971     return 0;
2972
2973   /* Different group IDs lead never belong to the same group.  */
2974   if (dra_info->group != drb_info->group)
2975     return dra_info->group < drb_info->group ? -1 : 1;
2976
2977   /* Ordering of DRs according to base.  */
2978   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2979                                DR_BASE_ADDRESS (drb));
2980   if (cmp != 0)
2981     return cmp;
2982
2983   /* And according to DR_OFFSET.  */
2984   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2985   if (cmp != 0)
2986     return cmp;
2987
2988   /* Put reads before writes.  */
2989   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2990     return DR_IS_READ (dra) ? -1 : 1;
2991
2992   /* Then sort after access size.  */
2993   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2994                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2995   if (cmp != 0)
2996     return cmp;
2997
2998   /* And after step.  */
2999   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3000   if (cmp != 0)
3001     return cmp;
3002
3003   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3004   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3005   if (cmp == 0)
3006     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3007   return cmp;
3008 }
3009
3010 /* If OP is the result of a conversion, return the unconverted value,
3011    otherwise return null.  */
3012
3013 static tree
3014 strip_conversion (tree op)
3015 {
3016   if (TREE_CODE (op) != SSA_NAME)
3017     return NULL_TREE;
3018   gimple *stmt = SSA_NAME_DEF_STMT (op);
3019   if (!is_gimple_assign (stmt)
3020       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3021     return NULL_TREE;
3022   return gimple_assign_rhs1 (stmt);
3023 }
3024
3025 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3026    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3027    be grouped in SLP mode.  */
3028
3029 static bool
3030 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3031                    bool allow_slp_p)
3032 {
3033   if (gimple_assign_single_p (stmt1_info->stmt))
3034     return gimple_assign_single_p (stmt2_info->stmt);
3035
3036   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3037   if (call1 && gimple_call_internal_p (call1))
3038     {
3039       /* Check for two masked loads or two masked stores.  */
3040       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3041       if (!call2 || !gimple_call_internal_p (call2))
3042         return false;
3043       internal_fn ifn = gimple_call_internal_fn (call1);
3044       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3045         return false;
3046       if (ifn != gimple_call_internal_fn (call2))
3047         return false;
3048
3049       /* Check that the masks are the same.  Cope with casts of masks,
3050          like those created by build_mask_conversion.  */
3051       tree mask1 = gimple_call_arg (call1, 2);
3052       tree mask2 = gimple_call_arg (call2, 2);
3053       if (!operand_equal_p (mask1, mask2, 0)
3054           && (ifn == IFN_MASK_STORE || !allow_slp_p))
3055         {
3056           mask1 = strip_conversion (mask1);
3057           if (!mask1)
3058             return false;
3059           mask2 = strip_conversion (mask2);
3060           if (!mask2)
3061             return false;
3062           if (!operand_equal_p (mask1, mask2, 0))
3063             return false;
3064         }
3065       return true;
3066     }
3067
3068   return false;
3069 }
3070
3071 /* Function vect_analyze_data_ref_accesses.
3072
3073    Analyze the access pattern of all the data references in the loop.
3074
3075    FORNOW: the only access pattern that is considered vectorizable is a
3076            simple step 1 (consecutive) access.
3077
3078    FORNOW: handle only arrays and pointer accesses.  */
3079
3080 opt_result
3081 vect_analyze_data_ref_accesses (vec_info *vinfo,
3082                                 vec<int> *dataref_groups)
3083 {
3084   unsigned int i;
3085   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3086
3087   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3088
3089   if (datarefs.is_empty ())
3090     return opt_result::success ();
3091
3092   /* Sort the array of datarefs to make building the interleaving chains
3093      linear.  Don't modify the original vector's order, it is needed for
3094      determining what dependencies are reversed.  */
3095   vec<dr_vec_info *> datarefs_copy;
3096   datarefs_copy.create (datarefs.length ());
3097   for (unsigned i = 0; i < datarefs.length (); i++)
3098     {
3099       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3100       /* If the caller computed DR grouping use that, otherwise group by
3101          basic blocks.  */
3102       if (dataref_groups)
3103         dr_info->group = (*dataref_groups)[i];
3104       else
3105         dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3106       datarefs_copy.quick_push (dr_info);
3107     }
3108   datarefs_copy.qsort (dr_group_sort_cmp);
3109   hash_set<stmt_vec_info> to_fixup;
3110
3111   /* Build the interleaving chains.  */
3112   for (i = 0; i < datarefs_copy.length () - 1;)
3113     {
3114       dr_vec_info *dr_info_a = datarefs_copy[i];
3115       data_reference_p dra = dr_info_a->dr;
3116       int dra_group_id = dr_info_a->group;
3117       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3118       stmt_vec_info lastinfo = NULL;
3119       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3120           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3121         {
3122           ++i;
3123           continue;
3124         }
3125       for (i = i + 1; i < datarefs_copy.length (); ++i)
3126         {
3127           dr_vec_info *dr_info_b = datarefs_copy[i];
3128           data_reference_p drb = dr_info_b->dr;
3129           int drb_group_id = dr_info_b->group;
3130           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3131           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3132               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3133             break;
3134
3135           /* ???  Imperfect sorting (non-compatible types, non-modulo
3136              accesses, same accesses) can lead to a group to be artificially
3137              split here as we don't just skip over those.  If it really
3138              matters we can push those to a worklist and re-iterate
3139              over them.  The we can just skip ahead to the next DR here.  */
3140
3141           /* DRs in a different DR group should not be put into the same
3142              interleaving group.  */
3143           if (dra_group_id != drb_group_id)
3144             break;
3145
3146           /* Check that the data-refs have same first location (except init)
3147              and they are both either store or load (not load and store,
3148              not masked loads or stores).  */
3149           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3150               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3151                                         DR_BASE_ADDRESS (drb)) != 0
3152               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3153               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3154             break;
3155
3156           /* Check that the data-refs have the same constant size.  */
3157           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3158           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3159           if (!tree_fits_uhwi_p (sza)
3160               || !tree_fits_uhwi_p (szb)
3161               || !tree_int_cst_equal (sza, szb))
3162             break;
3163
3164           /* Check that the data-refs have the same step.  */
3165           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3166             break;
3167
3168           /* Check the types are compatible.
3169              ???  We don't distinguish this during sorting.  */
3170           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3171                                    TREE_TYPE (DR_REF (drb))))
3172             break;
3173
3174           /* Check that the DR_INITs are compile-time constants.  */
3175           if (!tree_fits_shwi_p (DR_INIT (dra))
3176               || !tree_fits_shwi_p (DR_INIT (drb)))
3177             break;
3178
3179           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3180              just hold extra information.  */
3181           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3182               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3183               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3184             break;
3185
3186           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3187           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3188           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3189           HOST_WIDE_INT init_prev
3190             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3191           gcc_assert (init_a <= init_b
3192                       && init_a <= init_prev
3193                       && init_prev <= init_b);
3194
3195           /* Do not place the same access in the interleaving chain twice.  */
3196           if (init_b == init_prev)
3197             {
3198               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3199                           < gimple_uid (DR_STMT (drb)));
3200               /* Simply link in duplicates and fix up the chain below.  */
3201             }
3202           else
3203             {
3204               /* If init_b == init_a + the size of the type * k, we have an
3205                  interleaving, and DRA is accessed before DRB.  */
3206               unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3207               if (type_size_a == 0
3208                   || (((unsigned HOST_WIDE_INT)init_b - init_a)
3209                       % type_size_a != 0))
3210                 break;
3211
3212               /* If we have a store, the accesses are adjacent.  This splits
3213                  groups into chunks we support (we don't support vectorization
3214                  of stores with gaps).  */
3215               if (!DR_IS_READ (dra)
3216                   && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3217                       != type_size_a))
3218                 break;
3219
3220               /* If the step (if not zero or non-constant) is smaller than the
3221                  difference between data-refs' inits this splits groups into
3222                  suitable sizes.  */
3223               if (tree_fits_shwi_p (DR_STEP (dra)))
3224                 {
3225                   unsigned HOST_WIDE_INT step
3226                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3227                   if (step != 0
3228                       && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3229                     break;
3230                 }
3231             }
3232
3233           if (dump_enabled_p ())
3234             dump_printf_loc (MSG_NOTE, vect_location,
3235                              DR_IS_READ (dra)
3236                              ? "Detected interleaving load %T and %T\n"
3237                              : "Detected interleaving store %T and %T\n",
3238                              DR_REF (dra), DR_REF (drb));
3239
3240           /* Link the found element into the group list.  */
3241           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3242             {
3243               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3244               lastinfo = stmtinfo_a;
3245             }
3246           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3247           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3248           lastinfo = stmtinfo_b;
3249
3250           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3251             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3252
3253           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3254             dump_printf_loc (MSG_NOTE, vect_location,
3255                              "Load suitable for SLP vectorization only.\n");
3256
3257           if (init_b == init_prev
3258               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3259               && dump_enabled_p ())
3260             dump_printf_loc (MSG_NOTE, vect_location,
3261                              "Queuing group with duplicate access for fixup\n");
3262         }
3263     }
3264
3265   /* Fixup groups with duplicate entries by splitting it.  */
3266   while (1)
3267     {
3268       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3269       if (!(it != to_fixup.end ()))
3270         break;
3271       stmt_vec_info grp = *it;
3272       to_fixup.remove (grp);
3273
3274       /* Find the earliest duplicate group member.  */
3275       unsigned first_duplicate = -1u;
3276       stmt_vec_info next, g = grp;
3277       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3278         {
3279           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3280                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3281               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3282             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3283           g = next;
3284         }
3285       if (first_duplicate == -1U)
3286         continue;
3287
3288       /* Then move all stmts after the first duplicate to a new group.
3289          Note this is a heuristic but one with the property that *it
3290          is fixed up completely.  */
3291       g = grp;
3292       stmt_vec_info newgroup = NULL, ng = grp;
3293       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3294         {
3295           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3296             {
3297               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3298               if (!newgroup)
3299                 newgroup = next;
3300               else
3301                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3302               ng = next;
3303               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3304             }
3305           else
3306             g = DR_GROUP_NEXT_ELEMENT (g);
3307         }
3308       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3309
3310       /* Fixup the new group which still may contain duplicates.  */
3311       to_fixup.add (newgroup);
3312     }
3313
3314   dr_vec_info *dr_info;
3315   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3316     {
3317       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3318           && !vect_analyze_data_ref_access (vinfo, dr_info))
3319         {
3320           if (dump_enabled_p ())
3321             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322                              "not vectorized: complicated access pattern.\n");
3323
3324           if (is_a <bb_vec_info> (vinfo))
3325             {
3326               /* Mark the statement as not vectorizable.  */
3327               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3328               continue;
3329             }
3330           else
3331             {
3332               datarefs_copy.release ();
3333               return opt_result::failure_at (dr_info->stmt->stmt,
3334                                              "not vectorized:"
3335                                              " complicated access pattern.\n");
3336             }
3337         }
3338     }
3339
3340   datarefs_copy.release ();
3341   return opt_result::success ();
3342 }
3343
3344 /* Function vect_vfa_segment_size.
3345
3346    Input:
3347      DR_INFO: The data reference.
3348      LENGTH_FACTOR: segment length to consider.
3349
3350    Return a value suitable for the dr_with_seg_len::seg_len field.
3351    This is the "distance travelled" by the pointer from the first
3352    iteration in the segment to the last.  Note that it does not include
3353    the size of the access; in effect it only describes the first byte.  */
3354
3355 static tree
3356 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3357 {
3358   length_factor = size_binop (MINUS_EXPR,
3359                               fold_convert (sizetype, length_factor),
3360                               size_one_node);
3361   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3362                      length_factor);
3363 }
3364
3365 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3366    gives the worst-case number of bytes covered by the segment.  */
3367
3368 static unsigned HOST_WIDE_INT
3369 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3370 {
3371   stmt_vec_info stmt_vinfo = dr_info->stmt;
3372   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3373   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3374   unsigned HOST_WIDE_INT access_size = ref_size;
3375   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3376     {
3377       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3378       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3379     }
3380   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3381   int misalignment;
3382   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3383       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3384       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3385           == dr_explicit_realign_optimized))
3386     {
3387       /* We might access a full vector's worth.  */
3388       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3389     }
3390   return access_size;
3391 }
3392
3393 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3394    describes.  */
3395
3396 static unsigned int
3397 vect_vfa_align (dr_vec_info *dr_info)
3398 {
3399   return dr_alignment (dr_info->dr);
3400 }
3401
3402 /* Function vect_no_alias_p.
3403
3404    Given data references A and B with equal base and offset, see whether
3405    the alias relation can be decided at compilation time.  Return 1 if
3406    it can and the references alias, 0 if it can and the references do
3407    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3408    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3409    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3410
3411 static int
3412 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3413                          tree segment_length_a, tree segment_length_b,
3414                          unsigned HOST_WIDE_INT access_size_a,
3415                          unsigned HOST_WIDE_INT access_size_b)
3416 {
3417   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3418   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3419   poly_uint64 const_length_a;
3420   poly_uint64 const_length_b;
3421
3422   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3423      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3424      [a, a+12) */
3425   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3426     {
3427       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3428       offset_a -= const_length_a;
3429     }
3430   else
3431     const_length_a = tree_to_poly_uint64 (segment_length_a);
3432   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3433     {
3434       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3435       offset_b -= const_length_b;
3436     }
3437   else
3438     const_length_b = tree_to_poly_uint64 (segment_length_b);
3439
3440   const_length_a += access_size_a;
3441   const_length_b += access_size_b;
3442
3443   if (ranges_known_overlap_p (offset_a, const_length_a,
3444                               offset_b, const_length_b))
3445     return 1;
3446
3447   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3448                                offset_b, const_length_b))
3449     return 0;
3450
3451   return -1;
3452 }
3453
3454 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3455    in DDR is >= VF.  */
3456
3457 static bool
3458 dependence_distance_ge_vf (data_dependence_relation *ddr,
3459                            unsigned int loop_depth, poly_uint64 vf)
3460 {
3461   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3462       || DDR_NUM_DIST_VECTS (ddr) == 0)
3463     return false;
3464
3465   /* If the dependence is exact, we should have limited the VF instead.  */
3466   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3467
3468   unsigned int i;
3469   lambda_vector dist_v;
3470   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3471     {
3472       HOST_WIDE_INT dist = dist_v[loop_depth];
3473       if (dist != 0
3474           && !(dist > 0 && DDR_REVERSED_P (ddr))
3475           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3476         return false;
3477     }
3478
3479   if (dump_enabled_p ())
3480     dump_printf_loc (MSG_NOTE, vect_location,
3481                      "dependence distance between %T and %T is >= VF\n",
3482                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3483
3484   return true;
3485 }
3486
3487 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3488
3489 static void
3490 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3491 {
3492   dump_printf (dump_kind, "%s (%T) >= ",
3493                lower_bound.unsigned_p ? "unsigned" : "abs",
3494                lower_bound.expr);
3495   dump_dec (dump_kind, lower_bound.min_value);
3496 }
3497
3498 /* Record that the vectorized loop requires the vec_lower_bound described
3499    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3500
3501 static void
3502 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3503                         poly_uint64 min_value)
3504 {
3505   vec<vec_lower_bound> &lower_bounds
3506     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3507   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3508     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3509       {
3510         unsigned_p &= lower_bounds[i].unsigned_p;
3511         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3512         if (lower_bounds[i].unsigned_p != unsigned_p
3513             || maybe_lt (lower_bounds[i].min_value, min_value))
3514           {
3515             lower_bounds[i].unsigned_p = unsigned_p;
3516             lower_bounds[i].min_value = min_value;
3517             if (dump_enabled_p ())
3518               {
3519                 dump_printf_loc (MSG_NOTE, vect_location,
3520                                  "updating run-time check to ");
3521                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3522                 dump_printf (MSG_NOTE, "\n");
3523               }
3524           }
3525         return;
3526       }
3527
3528   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3529   if (dump_enabled_p ())
3530     {
3531       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3532       dump_lower_bound (MSG_NOTE, lower_bound);
3533       dump_printf (MSG_NOTE, "\n");
3534     }
3535   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3536 }
3537
3538 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3539    will span fewer than GAP bytes.  */
3540
3541 static bool
3542 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3543                   poly_int64 gap)
3544 {
3545   stmt_vec_info stmt_info = dr_info->stmt;
3546   HOST_WIDE_INT count
3547     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3548   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3549     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3550   return (estimated_poly_value (gap)
3551           <= count * vect_get_scalar_dr_size (dr_info));
3552 }
3553
3554 /* Return true if we know that there is no alias between DR_INFO_A and
3555    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3556    When returning true, set *LOWER_BOUND_OUT to this N.  */
3557
3558 static bool
3559 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3560                                 poly_uint64 *lower_bound_out)
3561 {
3562   /* Check that there is a constant gap of known sign between DR_A
3563      and DR_B.  */
3564   data_reference *dr_a = dr_info_a->dr;
3565   data_reference *dr_b = dr_info_b->dr;
3566   poly_int64 init_a, init_b;
3567   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3568       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3569       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3570       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3571       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3572       || !ordered_p (init_a, init_b))
3573     return false;
3574
3575   /* Sort DR_A and DR_B by the address they access.  */
3576   if (maybe_lt (init_b, init_a))
3577     {
3578       std::swap (init_a, init_b);
3579       std::swap (dr_info_a, dr_info_b);
3580       std::swap (dr_a, dr_b);
3581     }
3582
3583   /* If the two accesses could be dependent within a scalar iteration,
3584      make sure that we'd retain their order.  */
3585   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3586       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3587     return false;
3588
3589   /* There is no alias if abs (DR_STEP) is greater than or equal to
3590      the bytes spanned by the combination of the two accesses.  */
3591   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3592   return true;
3593 }
3594
3595 /* Function vect_prune_runtime_alias_test_list.
3596
3597    Prune a list of ddrs to be tested at run-time by versioning for alias.
3598    Merge several alias checks into one if possible.
3599    Return FALSE if resulting list of ddrs is longer than allowed by
3600    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3601
3602 opt_result
3603 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3604 {
3605   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3606   hash_set <tree_pair_hash> compared_objects;
3607
3608   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3609   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3610     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3611   const vec<vec_object_pair> &check_unequal_addrs
3612     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3613   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3614   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3615
3616   ddr_p ddr;
3617   unsigned int i;
3618   tree length_factor;
3619
3620   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3621
3622   /* Step values are irrelevant for aliasing if the number of vector
3623      iterations is equal to the number of scalar iterations (which can
3624      happen for fully-SLP loops).  */
3625   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3626
3627   if (!vf_one_p)
3628     {
3629       /* Convert the checks for nonzero steps into bound tests.  */
3630       tree value;
3631       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3632         vect_check_lower_bound (loop_vinfo, value, true, 1);
3633     }
3634
3635   if (may_alias_ddrs.is_empty ())
3636     return opt_result::success ();
3637
3638   comp_alias_ddrs.create (may_alias_ddrs.length ());
3639
3640   unsigned int loop_depth
3641     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3642                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3643
3644   /* First, we collect all data ref pairs for aliasing checks.  */
3645   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3646     {
3647       poly_uint64 lower_bound;
3648       tree segment_length_a, segment_length_b;
3649       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3650       unsigned int align_a, align_b;
3651
3652       /* Ignore the alias if the VF we chose ended up being no greater
3653          than the dependence distance.  */
3654       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3655         continue;
3656
3657       if (DDR_OBJECT_A (ddr))
3658         {
3659           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3660           if (!compared_objects.add (new_pair))
3661             {
3662               if (dump_enabled_p ())
3663                 dump_printf_loc (MSG_NOTE, vect_location,
3664                                  "checking that %T and %T"
3665                                  " have different addresses\n",
3666                                  new_pair.first, new_pair.second);
3667               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3668             }
3669           continue;
3670         }
3671
3672       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3673       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3674
3675       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3676       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3677
3678       bool preserves_scalar_order_p
3679         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3680       bool ignore_step_p
3681           = (vf_one_p
3682              && (preserves_scalar_order_p
3683                  || operand_equal_p (DR_STEP (dr_info_a->dr),
3684                                      DR_STEP (dr_info_b->dr))));
3685
3686       /* Skip the pair if inter-iteration dependencies are irrelevant
3687          and intra-iteration dependencies are guaranteed to be honored.  */
3688       if (ignore_step_p
3689           && (preserves_scalar_order_p
3690               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3691                                                  &lower_bound)))
3692         {
3693           if (dump_enabled_p ())
3694             dump_printf_loc (MSG_NOTE, vect_location,
3695                              "no need for alias check between "
3696                              "%T and %T when VF is 1\n",
3697                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3698           continue;
3699         }
3700
3701       /* See whether we can handle the alias using a bounds check on
3702          the step, and whether that's likely to be the best approach.
3703          (It might not be, for example, if the minimum step is much larger
3704          than the number of bytes handled by one vector iteration.)  */
3705       if (!ignore_step_p
3706           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3707           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3708                                              &lower_bound)
3709           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3710               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3711         {
3712           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3713           if (dump_enabled_p ())
3714             {
3715               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3716                                "%T and %T when the step %T is outside ",
3717                                DR_REF (dr_info_a->dr),
3718                                DR_REF (dr_info_b->dr),
3719                                DR_STEP (dr_info_a->dr));
3720               if (unsigned_p)
3721                 dump_printf (MSG_NOTE, "[0");
3722               else
3723                 {
3724                   dump_printf (MSG_NOTE, "(");
3725                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3726                 }
3727               dump_printf (MSG_NOTE, ", ");
3728               dump_dec (MSG_NOTE, lower_bound);
3729               dump_printf (MSG_NOTE, ")\n");
3730             }
3731           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3732                                   unsigned_p, lower_bound);
3733           continue;
3734         }
3735
3736       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3737       if (dr_group_first_a)
3738         {
3739           stmt_info_a = dr_group_first_a;
3740           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741         }
3742
3743       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3744       if (dr_group_first_b)
3745         {
3746           stmt_info_b = dr_group_first_b;
3747           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748         }
3749
3750       if (ignore_step_p)
3751         {
3752           segment_length_a = size_zero_node;
3753           segment_length_b = size_zero_node;
3754         }
3755       else
3756         {
3757           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3758                                 DR_STEP (dr_info_b->dr), 0))
3759             length_factor = scalar_loop_iters;
3760           else
3761             length_factor = size_int (vect_factor);
3762           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3763           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3764         }
3765       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3766       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3767       align_a = vect_vfa_align (dr_info_a);
3768       align_b = vect_vfa_align (dr_info_b);
3769
3770       /* See whether the alias is known at compilation time.  */
3771       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3772                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
3773           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3774                               DR_OFFSET (dr_info_b->dr), 0)
3775           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3776           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3777           && poly_int_tree_p (segment_length_a)
3778           && poly_int_tree_p (segment_length_b))
3779         {
3780           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3781                                              segment_length_a,
3782                                              segment_length_b,
3783                                              access_size_a,
3784                                              access_size_b);
3785           if (res >= 0 && dump_enabled_p ())
3786             {
3787               dump_printf_loc (MSG_NOTE, vect_location,
3788                                "can tell at compile time that %T and %T",
3789                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3790               if (res == 0)
3791                 dump_printf (MSG_NOTE, " do not alias\n");
3792               else
3793                 dump_printf (MSG_NOTE, " alias\n");
3794             }
3795
3796           if (res == 0)
3797             continue;
3798
3799           if (res == 1)
3800             return opt_result::failure_at (stmt_info_b->stmt,
3801                                            "not vectorized:"
3802                                            " compilation time alias: %G%G",
3803                                            stmt_info_a->stmt,
3804                                            stmt_info_b->stmt);
3805         }
3806
3807       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3808                             access_size_a, align_a);
3809       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3810                             access_size_b, align_b);
3811       /* Canonicalize the order to be the one that's needed for accurate
3812          RAW, WAR and WAW flags, in cases where the data references are
3813          well-ordered.  The order doesn't really matter otherwise,
3814          but we might as well be consistent.  */
3815       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3816         std::swap (dr_a, dr_b);
3817
3818       dr_with_seg_len_pair_t dr_with_seg_len_pair
3819         (dr_a, dr_b, (preserves_scalar_order_p
3820                       ? dr_with_seg_len_pair_t::WELL_ORDERED
3821                       : dr_with_seg_len_pair_t::REORDERED));
3822
3823       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824     }
3825
3826   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3827
3828   unsigned int count = (comp_alias_ddrs.length ()
3829                         + check_unequal_addrs.length ());
3830
3831   if (count
3832       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3833           == VECT_COST_MODEL_VERY_CHEAP))
3834     return opt_result::failure_at
3835       (vect_location, "would need a runtime alias check\n");
3836
3837   if (dump_enabled_p ())
3838     dump_printf_loc (MSG_NOTE, vect_location,
3839                      "improved number of alias checks from %d to %d\n",
3840                      may_alias_ddrs.length (), count);
3841   unsigned limit = param_vect_max_version_for_alias_checks;
3842   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3843     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3844   if (count > limit)
3845     return opt_result::failure_at
3846       (vect_location,
3847        "number of versioning for alias run-time tests exceeds %d "
3848        "(--param vect-max-version-for-alias-checks)\n", limit);
3849
3850   return opt_result::success ();
3851 }
3852
3853 /* Decide whether the target provides an internal function that can
3854    implement a gather load (READ_P true) or scatter store (READ_P false)
3855    of VECTYPE vectors whose memory elements have type MEMORY_TYPE.
3856    MASKED_P says whether the access is conditional.  OFFSET_TYPE is the
3857    scalar type of the offsets added to the invariant base address and
3858    SCALE is the factor applied to each offset *after* it has been
3859    converted to address width.  VINFO is used to look up vector types.
3860
3861    On success return true and set *IFN_OUT to the chosen function and
3862    *OFFSET_VECTYPE_OUT to the offset vector type; else return false.  */
3863
3864 bool
3865 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3866                           tree vectype, tree memory_type, tree offset_type,
3867                           int scale, internal_fn *ifn_out,
3868                           tree *offset_vectype_out)
3869 {
3870   /* For now the vector elements must be the same width as the memory
3871      elements.  */
3872   const unsigned int mem_width = tree_to_uhwi (TYPE_SIZE (memory_type));
3873   const unsigned int elt_width = vector_element_bits (vectype);
3874   if (mem_width != elt_width)
3875     return false;
3876
3877   /* The function that matches MASKED_P exactly, plus the masked variant
3878      that an unconditional access can fall back to.  */
3879   const internal_fn masked_fn
3880     = read_p ? IFN_MASK_GATHER_LOAD : IFN_MASK_SCATTER_STORE;
3881   const internal_fn plain_fn
3882     = read_p ? IFN_GATHER_LOAD : IFN_SCATTER_STORE;
3883   const internal_fn wanted_fn = masked_p ? masked_fn : plain_fn;
3884
3885   /* Try OFFSET_TYPE first and then successively wider offset types.  */
3886   while (true)
3887     {
3888       tree off_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3889       if (off_vectype == NULL_TREE)
3890         return false;
3891
3892       /* Test whether the target supports this combination, preferring
3893          the exact function over the masked fallback.  */
3894       internal_fn chosen_fn = IFN_LAST;
3895       if (internal_gather_scatter_fn_supported_p (wanted_fn, vectype,
3896                                                   memory_type, off_vectype,
3897                                                   scale))
3898         chosen_fn = wanted_fn;
3899       else if (!masked_p
3900                && internal_gather_scatter_fn_supported_p (masked_fn, vectype,
3901                                                           memory_type,
3902                                                           off_vectype,
3903                                                           scale))
3904         chosen_fn = masked_fn;
3905
3906       if (chosen_fn != IFN_LAST)
3907         {
3908           *ifn_out = chosen_fn;
3909           *offset_vectype_out = off_vectype;
3910           return true;
3911         }
3912
3913       /* Stop widening once the offset is at least as wide as both a
3914          pointer and a vector element.  */
3915       const unsigned int off_prec = TYPE_PRECISION (offset_type);
3916       if (off_prec >= POINTER_SIZE && off_prec >= elt_width)
3917         return false;
3918
3919       offset_type = build_nonstandard_integer_type
3920         (off_prec * 2, TYPE_UNSIGNED (offset_type));
3921     }
3922 }
3923
3924 /* Fill in *INFO from STMT_INFO, whose statement is already a call to an
3925    internal gather load or scatter store function.  */
3926
3927 static void
3928 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3929                                    gather_scatter_info *info)
3930 {
3931   gcall *gs_call = as_a <gcall *> (stmt_info->stmt);
3932
3933   /* Arguments 0, 1 and 2 of the call supply the base, offset and scale.  */
3934   info->ifn = gimple_call_internal_fn (gs_call);
3935   info->base = gimple_call_arg (gs_call, 0);
3936   info->offset = gimple_call_arg (gs_call, 1);
3937   info->scale = TREE_INT_CST_LOW (gimple_call_arg (gs_call, 2));
3938
3939   info->decl = NULL_TREE;
3940   info->offset_dt = vect_unknown_def_type;
3941   info->offset_vectype = NULL_TREE;
3942   info->element_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
3943   info->memory_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3944 }
3945
3946 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3947    gather load or scatter store.  Describe the operation in *INFO if so.
        LOOP_VINFO supplies the loop used for the invariance tests.  If the
        statement is already a gather/scatter internal call, *INFO is read
        straight from its arguments.  Otherwise the address is split into a
        loop-invariant INFO->base, an SSA_NAME INFO->offset that is not
        invariant in the loop, and an integer INFO->scale.  Note that true
        can be returned with INFO->ifn == IFN_LAST and INFO->decl ==
        NULL_TREE, so a true result alone does not show that a supporting
        internal function or builtin was found.  */
3948
3949 bool
3950 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3951                            gather_scatter_info *info)
3952 {
3953   HOST_WIDE_INT scale = 1;
3954   poly_int64 pbitpos, pbitsize;
3955   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3956   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3957   tree offtype = NULL_TREE;
3958   tree decl = NULL_TREE, base, off;
3959   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3960   tree memory_type = TREE_TYPE (DR_REF (dr));
3961   machine_mode pmode;
3962   int punsignedp, reversep, pvolatilep = 0;
3963   internal_fn ifn;
3964   tree offset_vectype;
3965   bool masked_p = false;
3966
3967   /* See whether this is already a call to a gather/scatter internal function.
3968      If not, see whether it's a masked load or store.  */
3969   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3970   if (call && gimple_call_internal_p (call))
3971     {
3972       ifn = gimple_call_internal_fn (call);
3973       if (internal_gather_scatter_fn_p (ifn))
3974         {
3975           vect_describe_gather_scatter_call (stmt_info, info);
3976           return true;
3977         }
3978       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3979     }
3980
3981   /* True if we should aim to use internal functions rather than
3982      built-in functions.  */
3983   bool use_ifn_p = (DR_IS_READ (dr)
3984                     ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3985                     : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3986
3987   base = DR_REF (dr);
3988   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3989      see if we can use the def stmt of the address.  */
3990   if (masked_p
3991       && TREE_CODE (base) == MEM_REF
3992       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3993       && integer_zerop (TREE_OPERAND (base, 1))
3994       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3995     {
3996       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3997       if (is_gimple_assign (def_stmt)
3998           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3999         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4000     }
4001
4002   /* The gather and scatter builtins need address of the form
4003      loop_invariant + vector * {1, 2, 4, 8}
4004      or
4005      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4006      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4007      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4008      multiplications and additions in it.  To get a vector, we need
4009      a single SSA_NAME that will be defined in the loop and will
4010      contain everything that is not loop invariant and that can be
4011      vectorized.  The following code attempts to find such a preexisting
4012      SSA_NAME OFF and put the loop invariants into a tree BASE
4013      that can be gimplified before the loop.  */
4014   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4015                               &punsignedp, &reversep, &pvolatilep);
4016   if (reversep)
4017     return false;
4018
4019   /* PR 107346.  Packed structs can have fields at offsets that are not
4020      multiples of BITS_PER_UNIT.  Do not use gather/scatters in such cases.  */
4021   if (!multiple_p (pbitpos, BITS_PER_UNIT))
4022     return false;
4023
4024   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4025
4026   if (TREE_CODE (base) == MEM_REF)
4027     {
4028       if (!integer_zerop (TREE_OPERAND (base, 1)))
4029         {
4030           if (off == NULL_TREE)
4031             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4032           else
4033             off = size_binop (PLUS_EXPR, off,
4034                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
4035         }
4036       base = TREE_OPERAND (base, 0);
4037     }
4038   else
4039     base = build_fold_addr_expr (base);
4040
4041   if (off == NULL_TREE)
4042     off = size_zero_node;
4043
4044   /* If base is not loop invariant, either off is 0, then we start with just
4045      the constant offset in the loop invariant BASE and continue with base
4046      as OFF, otherwise give up.
4047      We could handle that case by gimplifying the addition of base + off
4048      into some SSA_NAME and use that as off, but for now punt.  */
4049   if (!expr_invariant_in_loop_p (loop, base))
4050     {
4051       if (!integer_zerop (off))
4052         return false;
4053       off = base;
4054       base = size_int (pbytepos);
4055     }
4056   /* Otherwise put base + constant offset into the loop invariant BASE
4057      and continue with OFF.  */
4058   else
4059     {
4060       base = fold_convert (sizetype, base);
4061       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4062     }
4063
4064   /* OFF at this point may be either a SSA_NAME or some tree expression
4065      from get_inner_reference.  Try to peel off loop invariants from it
4066      into BASE as long as possible.  */
4067   STRIP_NOPS (off);
4068   while (offtype == NULL_TREE)
4069     {
4070       enum tree_code code;
4071       tree op0, op1, add = NULL_TREE;
4072
4073       if (TREE_CODE (off) == SSA_NAME)
4074         {
4075           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4076
4077           if (expr_invariant_in_loop_p (loop, off))
4078             return false;
4079
4080           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4081             break;
4082
4083           op0 = gimple_assign_rhs1 (def_stmt);
4084           code = gimple_assign_rhs_code (def_stmt);
4085           op1 = gimple_assign_rhs2 (def_stmt);
4086         }
4087       else
4088         {
               /* Ternary expressions do not fit the OP0/OP1 decomposition
                  used below.  */
4089           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4090             return false;
4091           code = TREE_CODE (off);
4092           extract_ops_from_tree (off, &code, &op0, &op1);
4093         }
4094       switch (code)
4095         {
4096         case POINTER_PLUS_EXPR:
4097         case PLUS_EXPR:
4098           if (expr_invariant_in_loop_p (loop, op0))
4099             {
4100               add = op0;
4101               off = op1;
4102             do_add:
                   /* The remaining OFF is still to be multiplied by SCALE, so
                      an invariant addend peeled off after a scale has been
                      recorded must be scaled before it is folded into BASE.  */
4103               add = fold_convert (sizetype, add);
4104               if (scale != 1)
4105                 add = size_binop (MULT_EXPR, add, size_int (scale));
4106               base = size_binop (PLUS_EXPR, base, add);
4107               continue;
4108             }
4109           if (expr_invariant_in_loop_p (loop, op1))
4110             {
4111               add = op1;
4112               off = op0;
4113               goto do_add;
4114             }
4115           break;
4116         case MINUS_EXPR:
4117           if (expr_invariant_in_loop_p (loop, op1))
4118             {
4119               add = fold_convert (sizetype, op1);
4120               add = size_binop (MINUS_EXPR, size_zero_node, add);
4121               off = op0;
4122               goto do_add;
4123             }
4124           break;
4125         case MULT_EXPR:
               /* A constant multiplication is absorbed as the scale only
                  while SCALE is still 1.  */
4126           if (scale == 1 && tree_fits_shwi_p (op1))
4127             {
4128               int new_scale = tree_to_shwi (op1);
4129               /* Only treat this as a scaling operation if the target
4130                  supports it for at least some offset type.  */
4131               if (use_ifn_p
4132                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4133                                                 masked_p, vectype, memory_type,
4134                                                 signed_char_type_node,
4135                                                 new_scale, &ifn,
4136                                                 &offset_vectype)
4137                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4138                                                 masked_p, vectype, memory_type,
4139                                                 unsigned_char_type_node,
4140                                                 new_scale, &ifn,
4141                                                 &offset_vectype))
4142                 break;
4143               scale = new_scale;
4144               off = op0;
4145               continue;
4146             }
4147           break;
4148         case SSA_NAME:
               /* OFF is a plain copy of OP0; look through it.  */
4149           off = op0;
4150           continue;
4151         CASE_CONVERT:
4152           if (!POINTER_TYPE_P (TREE_TYPE (op0))
4153               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4154             break;
4155
4156           /* Don't include the conversion if the target is happy with
4157              the current offset type.  */
4158           if (use_ifn_p
4159               && TREE_CODE (off) == SSA_NAME
4160               && !POINTER_TYPE_P (TREE_TYPE (off))
4161               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4162                                            masked_p, vectype, memory_type,
4163                                            TREE_TYPE (off), scale, &ifn,
4164                                            &offset_vectype))
4165             break;
4166
4167           if (TYPE_PRECISION (TREE_TYPE (op0))
4168               == TYPE_PRECISION (TREE_TYPE (off)))
4169             {
4170               off = op0;
4171               continue;
4172             }
4173
4174           /* Include the conversion if it is widening and we're using
4175              the IFN path or the target can handle the converted from
4176              offset or the current size is not already the same as the
4177              data vector element size.  */
4178           if ((TYPE_PRECISION (TREE_TYPE (op0))
4179                < TYPE_PRECISION (TREE_TYPE (off)))
4180               && (use_ifn_p
4181                   || (DR_IS_READ (dr)
4182                       ? (targetm.vectorize.builtin_gather
4183                          && targetm.vectorize.builtin_gather (vectype,
4184                                                               TREE_TYPE (op0),
4185                                                               scale))
4186                       : (targetm.vectorize.builtin_scatter
4187                          && targetm.vectorize.builtin_scatter (vectype,
4188                                                                TREE_TYPE (op0),
4189                                                                scale)))
4190                   || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4191                                        TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4192             {
                   /* Recording OFFTYPE makes this the last iteration of the
                      enclosing loop, whose condition is OFFTYPE == NULL_TREE.  */
4193               off = op0;
4194               offtype = TREE_TYPE (off);
4195               STRIP_NOPS (off);
4196               continue;
4197             }
4198           break;
4199         default:
4200           break;
4201         }
4202       break;
4203     }
4204
4205   /* If at the end OFF still isn't a SSA_NAME or isn't
4206      defined in the loop, punt.  */
4207   if (TREE_CODE (off) != SSA_NAME
4208       || expr_invariant_in_loop_p (loop, off))
4209     return false;
4210
4211   if (offtype == NULL_TREE)
4212     offtype = TREE_TYPE (off);
4213
       /* A failed lookup on the internal-function path is recorded as
          IFN_LAST rather than rejected here.  */
4214   if (use_ifn_p)
4215     {
4216       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4217                                      vectype, memory_type, offtype, scale,
4218                                      &ifn, &offset_vectype))
4219         ifn = IFN_LAST;
4220       decl = NULL_TREE;
4221     }
4222   else
4223     {
4224       if (DR_IS_READ (dr))
4225         {
4226           if (targetm.vectorize.builtin_gather)
4227             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4228         }
4229       else
4230         {
4231           if (targetm.vectorize.builtin_scatter)
4232             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4233         }
4234       ifn = IFN_LAST;
4235       /* The offset vector type will be read from DECL when needed.  */
4236       offset_vectype = NULL_TREE;
4237     }
4238
4239   info->ifn = ifn;
4240   info->decl = decl;
4241   info->base = base;
4242   info->offset = off;
4243   info->offset_dt = vect_unknown_def_type;
4244   info->offset_vectype = offset_vectype;
4245   info->scale = scale;
4246   info->element_type = TREE_TYPE (vectype);
4247   info->memory_type = memory_type;
4248   return true;
4249 }
4250
4251 /* Find the data reference in STMT, analyze it with respect to LOOP and
4252    append it to DATAREFS.  If DATAREF_GROUPS is nonnull, also push GROUP_ID
4253    onto it for the appended reference.  Clobbers and statements without
        any data reference succeed without appending anything.  Return a
        failure result for statements that cannot be handled, such as
        volatile accesses, statements that can throw internally, statements
        with more than one data reference, references in calls other than
        IFN_MASK_LOAD and IFN_MASK_STORE, bit-field accesses and references
        whose base address is a constant.  */
4254
4255 opt_result
4256 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4257                                vec<data_reference_p> *datarefs,
4258                                vec<int> *dataref_groups, int group_id)
4259 {
4260   /* We can ignore clobbers for dataref analysis - they are removed during
4261      loop vectorization and BB vectorization checks dependences with a
4262      stmt walk.  */
4263   if (gimple_clobber_p (stmt))
4264     return opt_result::success ();
4265
4266   if (gimple_has_volatile_ops (stmt))
4267     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4268                                    stmt);
4269
4270   if (stmt_can_throw_internal (cfun, stmt))
4271     return opt_result::failure_at (stmt,
4272                                    "not vectorized:"
4273                                    " statement can throw an exception: %G",
4274                                    stmt);
4275
4276   auto_vec<data_reference_p, 2> refs;
4277   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4278   if (!res)
4279     return res;
4280
4281   if (refs.is_empty ())
4282     return opt_result::success ();
4283
       /* Only a single data reference per statement is accepted; release
          everything that was found before giving up.  */
4284   if (refs.length () > 1)
4285     {
4286       while (!refs.is_empty ())
4287         free_data_ref (refs.pop ());
4288       return opt_result::failure_at (stmt,
4289                                      "not vectorized: more than one "
4290                                      "data ref in stmt: %G", stmt);
4291     }
4292
       /* Among calls, only IFN_MASK_LOAD and IFN_MASK_STORE may carry a
          data reference.  */
4293   data_reference_p dr = refs.pop ();
4294   if (gcall *call = dyn_cast <gcall *> (stmt))
4295     if (!gimple_call_internal_p (call)
4296         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4297             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4298       {
4299         free_data_ref (dr);
4300         return opt_result::failure_at (stmt,
4301                                        "not vectorized: dr in a call %G", stmt);
4302       }
4303
4304   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4305       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4306     {
4307       free_data_ref (dr);
4308       return opt_result::failure_at (stmt,
4309                                      "not vectorized:"
4310                                      " statement is an unsupported"
4311                                      " bitfield access %G", stmt);
4312     }
4313
4314   if (DR_BASE_ADDRESS (dr)
4315       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4316     {
4317       free_data_ref (dr);
4318       return opt_result::failure_at (stmt,
4319                                      "not vectorized:"
4320                                      " base addr of dr is a constant\n");
4321     }
4322
4323   /* Check whether this may be a SIMD lane access and adjust the
4324      DR to make it easier for us to handle it.  This is only tried when
          LOOP has a simduid and the analysis of DR left one of its base
          address, offset, init or step unset.  */
4325   if (loop
4326       && loop->simduid
4327       && (!DR_BASE_ADDRESS (dr)
4328           || !DR_OFFSET (dr)
4329           || !DR_INIT (dr)
4330           || !DR_STEP (dr)))
4331     {
4332       struct data_reference *newdr
4333         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4334                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4335       if (DR_BASE_ADDRESS (newdr)
4336           && DR_OFFSET (newdr)
4337           && DR_INIT (newdr)
4338           && DR_STEP (newdr)
4339           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4340           && integer_zerop (DR_STEP (newdr)))
4341         {
4342           tree base_address = DR_BASE_ADDRESS (newdr);
4343           tree off = DR_OFFSET (newdr);
               /* STEP stays 1 unless OFF turns out to be a multiplication by
                  a constant below.  */
4344           tree step = ssize_int (1);
4345           if (integer_zerop (off)
4346               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4347             {
4348               off = TREE_OPERAND (base_address, 1);
4349               base_address = TREE_OPERAND (base_address, 0);
4350             }
4351           STRIP_NOPS (off);
4352           if (TREE_CODE (off) == MULT_EXPR
4353               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4354             {
4355               step = TREE_OPERAND (off, 1);
4356               off = TREE_OPERAND (off, 0);
4357               STRIP_NOPS (off);
4358             }
4359           if (CONVERT_EXPR_P (off)
4360               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4361                   < TYPE_PRECISION (TREE_TYPE (off))))
4362             off = TREE_OPERAND (off, 0);
4363           if (TREE_CODE (off) == SSA_NAME)
4364             {
4365               gimple *def = SSA_NAME_DEF_STMT (off);
4366               /* Look through widening conversion.  */
4367               if (is_gimple_assign (def)
4368                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4369                 {
4370                   tree rhs1 = gimple_assign_rhs1 (def);
4371                   if (TREE_CODE (rhs1) == SSA_NAME
4372                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4373                       && (TYPE_PRECISION (TREE_TYPE (off))
4374                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4375                     def = SSA_NAME_DEF_STMT (rhs1);
4376                 }
4377               if (is_gimple_call (def)
4378                   && gimple_call_internal_p (def)
4379                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4380                 {
4381                   tree arg = gimple_call_arg (def, 0);
4382                   tree reft = TREE_TYPE (DR_REF (newdr));
4383                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4384                   arg = SSA_NAME_VAR (arg);
4385                   if (arg == loop->simduid
4386                       /* For now.  */
4387                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4388                     {
4389                       DR_BASE_ADDRESS (newdr) = base_address;
4390                       DR_OFFSET (newdr) = ssize_int (0);
4391                       DR_STEP (newdr) = step;
4392                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4393                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4394                       /* Mark as simd-lane access.  Argument 1 of the
                              IFN_GOMP_SIMD_LANE call, N, is stored in AUX as the
                              negative marker -1 - N.  */
4395                       tree arg2 = gimple_call_arg (def, 1);
4396                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
                           /* NEWDR replaces DR from here on.  */
4397                       free_data_ref (dr);
4398                       datarefs->safe_push (newdr);
4399                       if (dataref_groups)
4400                         dataref_groups->safe_push (group_id);
4401                       return opt_result::success ();
4402                     }
4403                 }
4404             }
4405         }
           /* Not recognized as a SIMD lane access; drop NEWDR and keep DR.  */
4406       free_data_ref (newdr);
4407     }
4408
4409   datarefs->safe_push (dr);
4410   if (dataref_groups)
4411     dataref_groups->safe_push (group_id);
4412   return opt_result::success ();
4413 }
4414
4415 /* Function vect_analyze_data_refs.
4416
4417   Find all the data references in the loop or basic block.
4418
4419    The general structure of the analysis of data refs in the vectorizer is as
4420    follows:
4421    1- vect_analyze_data_refs(loop/bb): call
4422       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4423       in the loop/bb and their dependences.
4424    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4425    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4426    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4427
4428 */
4429
4430 opt_result
4431 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4432 {
4433   class loop *loop = NULL;
4434   unsigned int i;
4435   struct data_reference *dr;
4436   tree scalar_type;
4437
4438   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4439
4440   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4441     loop = LOOP_VINFO_LOOP (loop_vinfo);
4442
4443   /* Go through the data-refs, check that the analysis succeeded.  Update
4444      pointer from stmt_vec_info struct to DR and vectype.  */
4445
4446   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4447   FOR_EACH_VEC_ELT (datarefs, i, dr)
4448     {
4449       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4450       poly_uint64 vf;
4451
4452       gcc_assert (DR_REF (dr));
4453       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4454       gcc_assert (!stmt_info->dr_aux.dr);
4455       stmt_info->dr_aux.dr = dr;
4456       stmt_info->dr_aux.stmt = stmt_info;
4457
4458       /* Check that analysis of the data-ref succeeded.  */
4459       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4460           || !DR_STEP (dr))
4461         {
4462           bool maybe_gather
4463             = DR_IS_READ (dr)
4464               && !TREE_THIS_VOLATILE (DR_REF (dr));
4465           bool maybe_scatter
4466             = DR_IS_WRITE (dr)
4467               && !TREE_THIS_VOLATILE (DR_REF (dr));
4468
4469           /* If target supports vector gather loads or scatter stores,
4470              see if they can't be used.  */
4471           if (is_a <loop_vec_info> (vinfo)
4472               && !nested_in_vect_loop_p (loop, stmt_info))
4473             {
4474               if (maybe_gather || maybe_scatter)
4475                 {
4476                   if (maybe_gather)
4477                     gatherscatter = GATHER;
4478                   else
4479                     gatherscatter = SCATTER;
4480                 }
4481             }
4482
4483           if (gatherscatter == SG_NONE)
4484             {
4485               if (dump_enabled_p ())
4486                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4487                                  "not vectorized: data ref analysis "
4488                                  "failed %G", stmt_info->stmt);
4489               if (is_a <bb_vec_info> (vinfo))
4490                 {
4491                   /* In BB vectorization the ref can still participate
4492                      in dependence analysis, we just can't vectorize it.  */
4493                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4494                   continue;
4495                 }
4496               return opt_result::failure_at (stmt_info->stmt,
4497                                              "not vectorized:"
4498                                              " data ref analysis failed: %G",
4499                                              stmt_info->stmt);
4500             }
4501         }
4502
4503       /* See if this was detected as SIMD lane access.  */
4504       if (dr->aux == (void *)-1
4505           || dr->aux == (void *)-2
4506           || dr->aux == (void *)-3
4507           || dr->aux == (void *)-4)
4508         {
4509           if (nested_in_vect_loop_p (loop, stmt_info))
4510             return opt_result::failure_at (stmt_info->stmt,
4511                                            "not vectorized:"
4512                                            " data ref analysis failed: %G",
4513                                            stmt_info->stmt);
4514           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4515             = -(uintptr_t) dr->aux;
4516         }
4517
4518       tree base = get_base_address (DR_REF (dr));
4519       if (base && VAR_P (base) && DECL_NONALIASED (base))
4520         {
4521           if (dump_enabled_p ())
4522             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4523                              "not vectorized: base object not addressable "
4524                              "for stmt: %G", stmt_info->stmt);
4525           if (is_a <bb_vec_info> (vinfo))
4526             {
4527               /* In BB vectorization the ref can still participate
4528                  in dependence analysis, we just can't vectorize it.  */
4529               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4530               continue;
4531             }
4532           return opt_result::failure_at (stmt_info->stmt,
4533                                          "not vectorized: base object not"
4534                                          " addressable for stmt: %G",
4535                                          stmt_info->stmt);
4536         }
4537
4538       if (is_a <loop_vec_info> (vinfo)
4539           && DR_STEP (dr)
4540           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4541         {
4542           if (nested_in_vect_loop_p (loop, stmt_info))
4543             return opt_result::failure_at (stmt_info->stmt,
4544                                            "not vectorized: "
4545                                            "not suitable for strided load %G",
4546                                            stmt_info->stmt);
4547           STMT_VINFO_STRIDED_P (stmt_info) = true;
4548         }
4549
4550       /* Update DR field in stmt_vec_info struct.  */
4551
4552       /* If the dataref is in an inner-loop of the loop that is considered for
4553          vectorization, we also want to analyze the access relative to
4554          the outer-loop (DR contains information only relative to the
4555          inner-most enclosing loop).  We do that by building a reference to the
4556          first location accessed by the inner-loop, and analyze it relative to
4557          the outer-loop.  */
4558       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4559         {
4560           /* Build a reference to the first location accessed by the
4561              inner loop: *(BASE + INIT + OFFSET).  By construction,
4562              this address must be invariant in the inner loop, so we
4563              can consider it as being used in the outer loop.  */
4564           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4565           tree offset = unshare_expr (DR_OFFSET (dr));
4566           tree init = unshare_expr (DR_INIT (dr));
4567           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4568                                           init, offset);
4569           tree init_addr = fold_build_pointer_plus (base, init_offset);
4570           tree init_ref = build_fold_indirect_ref (init_addr);
4571
4572           if (dump_enabled_p ())
4573             dump_printf_loc (MSG_NOTE, vect_location,
4574                              "analyze in outer loop: %T\n", init_ref);
4575
4576           opt_result res
4577             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4578                                     init_ref, loop, stmt_info->stmt);
4579           if (!res)
4580             /* dr_analyze_innermost already explained the failure.  */
4581             return res;
4582
4583           if (dump_enabled_p ())
4584             dump_printf_loc (MSG_NOTE, vect_location,
4585                              "\touter base_address: %T\n"
4586                              "\touter offset from base address: %T\n"
4587                              "\touter constant offset from base address: %T\n"
4588                              "\touter step: %T\n"
4589                              "\touter base alignment: %d\n\n"
4590                              "\touter base misalignment: %d\n"
4591                              "\touter offset alignment: %d\n"
4592                              "\touter step alignment: %d\n",
4593                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4594                              STMT_VINFO_DR_OFFSET (stmt_info),
4595                              STMT_VINFO_DR_INIT (stmt_info),
4596                              STMT_VINFO_DR_STEP (stmt_info),
4597                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4598                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4599                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4600                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4601         }
4602
4603       /* Set vectype for STMT.  */
4604       scalar_type = TREE_TYPE (DR_REF (dr));
4605       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4606       if (!vectype)
4607         {
4608           if (dump_enabled_p ())
4609             {
4610               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4611                                "not vectorized: no vectype for stmt: %G",
4612                                stmt_info->stmt);
4613               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4614               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4615                                  scalar_type);
4616               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4617             }
4618
4619           if (is_a <bb_vec_info> (vinfo))
4620             {
4621               /* No vector type is fine, the ref can still participate
4622                  in dependence analysis, we just can't vectorize it.  */
4623               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4624               continue;
4625             }
4626           if (fatal)
4627             *fatal = false;
4628           return opt_result::failure_at (stmt_info->stmt,
4629                                          "not vectorized:"
4630                                          " no vectype for stmt: %G"
4631                                          " scalar_type: %T\n",
4632                                          stmt_info->stmt, scalar_type);
4633         }
4634       else
4635         {
4636           if (dump_enabled_p ())
4637             dump_printf_loc (MSG_NOTE, vect_location,
4638                              "got vectype for stmt: %G%T\n",
4639                              stmt_info->stmt, vectype);
4640         }
4641
4642       /* Adjust the minimal vectorization factor according to the
4643          vector type.  */
4644       vf = TYPE_VECTOR_SUBPARTS (vectype);
4645       *min_vf = upper_bound (*min_vf, vf);
4646
4647       /* Leave the BB vectorizer to pick the vector type later, based on
4648          the final dataref group size and SLP node size.  */
4649       if (is_a <loop_vec_info> (vinfo))
4650         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4651
4652       if (gatherscatter != SG_NONE)
4653         {
4654           gather_scatter_info gs_info;
4655           if (!vect_check_gather_scatter (stmt_info,
4656                                           as_a <loop_vec_info> (vinfo),
4657                                           &gs_info)
4658               || !get_vectype_for_scalar_type (vinfo,
4659                                                TREE_TYPE (gs_info.offset)))
4660             {
4661               if (fatal)
4662                 *fatal = false;
4663               return opt_result::failure_at
4664                         (stmt_info->stmt,
4665                          (gatherscatter == GATHER)
4666                          ? "not vectorized: not suitable for gather load %G"
4667                          : "not vectorized: not suitable for scatter store %G",
4668                          stmt_info->stmt);
4669             }
4670           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4671         }
4672     }
4673
4674   /* We used to stop processing and prune the list here.  Verify we no
4675      longer need to.  */
4676   gcc_assert (i == datarefs.length ());
4677
4678   return opt_result::success ();
4679 }
4680
4681
4682 /* Function vect_get_new_vect_var.
4683
4684    Returns a name for a new variable.  The current naming scheme appends the
4685    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4686    the name of vectorizer generated variables, and appends that to NAME if
4687    provided.  */
4688
4689 tree
4690 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4691 {
4692   const char *prefix;
4693   tree new_vect_var;
4694
4695   switch (var_kind)
4696   {
4697   case vect_simple_var:
4698     prefix = "vect";
4699     break;
4700   case vect_scalar_var:
4701     prefix = "stmp";
4702     break;
4703   case vect_mask_var:
4704     prefix = "mask";
4705     break;
4706   case vect_pointer_var:
4707     prefix = "vectp";
4708     break;
4709   default:
4710     gcc_unreachable ();
4711   }
4712
4713   if (name)
4714     {
4715       char* tmp = concat (prefix, "_", name, NULL);
4716       new_vect_var = create_tmp_reg (type, tmp);
4717       free (tmp);
4718     }
4719   else
4720     new_vect_var = create_tmp_reg (type, prefix);
4721
4722   return new_vect_var;
4723 }
4724
4725 /* Like vect_get_new_vect_var but return an SSA name.  */
4726
4727 tree
4728 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4729 {
4730   const char *prefix;
4731   tree new_vect_var;
4732
4733   switch (var_kind)
4734   {
4735   case vect_simple_var:
4736     prefix = "vect";
4737     break;
4738   case vect_scalar_var:
4739     prefix = "stmp";
4740     break;
4741   case vect_pointer_var:
4742     prefix = "vectp";
4743     break;
4744   default:
4745     gcc_unreachable ();
4746   }
4747
4748   if (name)
4749     {
4750       char* tmp = concat (prefix, "_", name, NULL);
4751       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4752       free (tmp);
4753     }
4754   else
4755     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4756
4757   return new_vect_var;
4758 }
4759
4760 /* Duplicate points-to info on NAME from DR_INFO.  */
4761
4762 static void
4763 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4764 {
4765   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4766   /* DR_PTR_INFO is for a base SSA name, not including constant or
4767      variable offsets in the ref so its alignment info does not apply.  */
4768   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4769 }
4770
4771 /* Function vect_create_addr_base_for_vector_ref.
4772
4773    Create an expression that computes the address of the first memory location
4774    that will be accessed for a data reference.
4775
4776    Input:
4777    STMT_INFO: The statement containing the data reference.
4778    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4779    OFFSET: Optional. If supplied, it is be added to the initial address.
4780    LOOP:    Specify relative to which loop-nest should the address be computed.
4781             For example, when the dataref is in an inner-loop nested in an
4782             outer-loop that is now being vectorized, LOOP can be either the
4783             outer-loop, or the inner-loop.  The first memory location accessed
4784             by the following dataref ('in' points to short):
4785
4786                 for (i=0; i<N; i++)
4787                    for (j=0; j<M; j++)
4788                      s += in[i+j]
4789
4790             is as follows:
4791             if LOOP=i_loop:     &in             (relative to i_loop)
4792             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4793
4794    Output:
4795    1. Return an SSA_NAME whose value is the address of the memory location of
4796       the first vector of the data reference.
4797    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4798       these statement(s) which define the returned SSA_NAME.
4799
4800    FORNOW: We are only handling array accesses with step 1.  */
4801
4802 tree
4803 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4804                                       gimple_seq *new_stmt_list,
4805                                       tree offset)
4806 {
4807   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4808   struct data_reference *dr = dr_info->dr;
4809   const char *base_name;
4810   tree addr_base;
4811   tree dest;
4812   gimple_seq seq = NULL;
4813   tree vect_ptr_type;
4814   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4815   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4816
4817   tree data_ref_base = unshare_expr (drb->base_address);
4818   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4819   tree init = unshare_expr (drb->init);
4820
4821   if (loop_vinfo)
4822     base_name = get_name (data_ref_base);
4823   else
4824     {
4825       base_offset = ssize_int (0);
4826       init = ssize_int (0);
4827       base_name = get_name (DR_REF (dr));
4828     }
4829
4830   /* Create base_offset */
4831   base_offset = size_binop (PLUS_EXPR,
4832                             fold_convert (sizetype, base_offset),
4833                             fold_convert (sizetype, init));
4834
4835   if (offset)
4836     {
4837       offset = fold_convert (sizetype, offset);
4838       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4839                                  base_offset, offset);
4840     }
4841
4842   /* base + base_offset */
4843   if (loop_vinfo)
4844     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4845   else
4846     addr_base = build1 (ADDR_EXPR,
4847                         build_pointer_type (TREE_TYPE (DR_REF (dr))),
4848                         /* Strip zero offset components since we don't need
4849                            them and they can confuse late diagnostics if
4850                            we CSE them wrongly.  See PR106904 for example.  */
4851                         unshare_expr (strip_zero_offset_components
4852                                                                 (DR_REF (dr))));
4853
4854   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4855   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4856   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4857   gimple_seq_add_seq (new_stmt_list, seq);
4858
4859   if (DR_PTR_INFO (dr)
4860       && TREE_CODE (addr_base) == SSA_NAME
4861       /* We should only duplicate pointer info to newly created SSA names.  */
4862       && SSA_NAME_VAR (addr_base) == dest)
4863     {
4864       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4865       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4866     }
4867
4868   if (dump_enabled_p ())
4869     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4870
4871   return addr_base;
4872 }
4873
4874
4875 /* Function vect_create_data_ref_ptr.
4876
4877    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4878    location accessed in the loop by STMT_INFO, along with the def-use update
4879    chain to appropriately advance the pointer through the loop iterations.
4880    Also set aliasing information for the pointer.  This pointer is used by
4881    the callers to this function to create a memory reference expression for
4882    vector load/store access.
4883
4884    Input:
4885    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4886          GIMPLE_ASSIGN <name, data-ref> or
4887          GIMPLE_ASSIGN <data-ref, name>.
4888    2. AGGR_TYPE: the type of the reference, which should be either a vector
4889         or an array.
4890    3. AT_LOOP: the loop where the vector memref is to be created.
4891    4. OFFSET (optional): a byte offset to be added to the initial address
4892         accessed by the data-ref in STMT_INFO.
4893    5. BSI: location where the new stmts are to be placed if there is no loop
4894    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4895         pointing to the initial address.
4896    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4897         to the IV during each iteration of the loop.  NULL says to move
4898         by one copy of AGGR_TYPE up or down, depending on the step of the
4899         data reference.
4900
4901    Output:
4902    1. Declare a new ptr to vector_type, and have it point to the base of the
4903       data reference (initial addressed accessed by the data reference).
4904       For example, for vector of type V8HI, the following code is generated:
4905
4906       v8hi *ap;
4907       ap = (v8hi *)initial_address;
4908
4909       if OFFSET is not supplied:
4910          initial_address = &a[init];
4911       if OFFSET is supplied:
4912          initial_address = &a[init] + OFFSET;
4913       if BYTE_OFFSET is supplied:
4914          initial_address = &a[init] + BYTE_OFFSET;
4915
4916       Return the initial_address in INITIAL_ADDRESS.
4917
4918    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4919       update the pointer in each iteration of the loop.
4920
4921       Return the increment stmt that updates the pointer in PTR_INCR.
4922
4923    3. Return the pointer.  */
4924
4925 tree
4926 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4927                           tree aggr_type, class loop *at_loop, tree offset,
4928                           tree *initial_address, gimple_stmt_iterator *gsi,
4929                           gimple **ptr_incr, bool only_init,
4930                           tree iv_step)
4931 {
4932   const char *base_name;
4933   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4934   class loop *loop = NULL;
4935   bool nested_in_vect_loop = false;
4936   class loop *containing_loop = NULL;
4937   tree aggr_ptr_type;
4938   tree aggr_ptr;
4939   tree new_temp;
4940   gimple_seq new_stmt_list = NULL;
4941   edge pe = NULL;
4942   basic_block new_bb;
4943   tree aggr_ptr_init;
4944   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4945   struct data_reference *dr = dr_info->dr;
4946   tree aptr;
4947   gimple_stmt_iterator incr_gsi;
4948   bool insert_after;
4949   tree indx_before_incr, indx_after_incr;
4950   gimple *incr;
4951   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4952
4953   gcc_assert (iv_step != NULL_TREE
4954               || TREE_CODE (aggr_type) == ARRAY_TYPE
4955               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4956
4957   if (loop_vinfo)
4958     {
4959       loop = LOOP_VINFO_LOOP (loop_vinfo);
4960       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4961       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4962       pe = loop_preheader_edge (loop);
4963     }
4964   else
4965     {
4966       gcc_assert (bb_vinfo);
4967       only_init = true;
4968       *ptr_incr = NULL;
4969     }
4970
4971   /* Create an expression for the first address accessed by this load
4972      in LOOP.  */
4973   base_name = get_name (DR_BASE_ADDRESS (dr));
4974
4975   if (dump_enabled_p ())
4976     {
4977       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4978       dump_printf_loc (MSG_NOTE, vect_location,
4979                        "create %s-pointer variable to type: %T",
4980                        get_tree_code_name (TREE_CODE (aggr_type)),
4981                        aggr_type);
4982       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4983         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4984       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4985         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4986       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4987         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4988       else
4989         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4990       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4991     }
4992
4993   /* (1) Create the new aggregate-pointer variable.
4994      Vector and array types inherit the alias set of their component
4995      type by default so we need to use a ref-all pointer if the data
4996      reference does not conflict with the created aggregated data
4997      reference because it is not addressable.  */
4998   bool need_ref_all = false;
4999   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5000                               get_alias_set (DR_REF (dr))))
5001     need_ref_all = true;
5002   /* Likewise for any of the data references in the stmt group.  */
5003   else if (DR_GROUP_SIZE (stmt_info) > 1)
5004     {
5005       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5006       do
5007         {
5008           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5009           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5010                                       get_alias_set (DR_REF (sdr))))
5011             {
5012               need_ref_all = true;
5013               break;
5014             }
5015           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5016         }
5017       while (sinfo);
5018     }
5019   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5020                                                need_ref_all);
5021   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5022
5023
5024   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5025      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5026      def-use update cycles for the pointer: one relative to the outer-loop
5027      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5028      to the inner-loop (which is the inner-most loop containing the dataref),
5029      and this is done be step (5) below.
5030
5031      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5032      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5033      redundant.  Steps (3),(4) create the following:
5034
5035         vp0 = &base_addr;
5036         LOOP:   vp1 = phi(vp0,vp2)
5037                 ...
5038                 ...
5039                 vp2 = vp1 + step
5040                 goto LOOP
5041
5042      If there is an inner-loop nested in loop, then step (5) will also be
5043      applied, and an additional update in the inner-loop will be created:
5044
5045         vp0 = &base_addr;
5046         LOOP:   vp1 = phi(vp0,vp2)
5047                 ...
5048         inner:     vp3 = phi(vp1,vp4)
5049                    vp4 = vp3 + inner_step
5050                    if () goto inner
5051                 ...
5052                 vp2 = vp1 + step
5053                 if () goto LOOP   */
5054
5055   /* (2) Calculate the initial address of the aggregate-pointer, and set
5056      the aggregate-pointer to point to it before the loop.  */
5057
5058   /* Create: (&(base[init_val]+offset) in the loop preheader.  */
5059
5060   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5061                                                    stmt_info, &new_stmt_list,
5062                                                    offset);
5063   if (new_stmt_list)
5064     {
5065       if (pe)
5066         {
5067           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5068           gcc_assert (!new_bb);
5069         }
5070       else
5071         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5072     }
5073
5074   *initial_address = new_temp;
5075   aggr_ptr_init = new_temp;
5076
5077   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5078      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5079      inner-loop nested in LOOP (during outer-loop vectorization).  */
5080
5081   /* No update in loop is required.  */
5082   if (only_init && (!loop_vinfo || at_loop == loop))
5083     aptr = aggr_ptr_init;
5084   else
5085     {
5086       /* Accesses to invariant addresses should be handled specially
5087          by the caller.  */
5088       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5089       gcc_assert (!integer_zerop (step));
5090
5091       if (iv_step == NULL_TREE)
5092         {
5093           /* The step of the aggregate pointer is the type size,
5094              negated for downward accesses.  */
5095           iv_step = TYPE_SIZE_UNIT (aggr_type);
5096           if (tree_int_cst_sgn (step) == -1)
5097             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5098         }
5099
5100       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5101
5102       create_iv (aggr_ptr_init, PLUS_EXPR,
5103                  fold_convert (aggr_ptr_type, iv_step),
5104                  aggr_ptr, loop, &incr_gsi, insert_after,
5105                  &indx_before_incr, &indx_after_incr);
5106       incr = gsi_stmt (incr_gsi);
5107
5108       /* Copy the points-to information if it exists. */
5109       if (DR_PTR_INFO (dr))
5110         {
5111           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5112           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5113         }
5114       if (ptr_incr)
5115         *ptr_incr = incr;
5116
5117       aptr = indx_before_incr;
5118     }
5119
5120   if (!nested_in_vect_loop || only_init)
5121     return aptr;
5122
5123
5124   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5125      nested in LOOP, if exists.  */
5126
5127   gcc_assert (nested_in_vect_loop);
5128   if (!only_init)
5129     {
5130       standard_iv_increment_position (containing_loop, &incr_gsi,
5131                                       &insert_after);
5132       create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5133                  aggr_ptr, containing_loop, &incr_gsi, insert_after,
5134                  &indx_before_incr, &indx_after_incr);
5135       incr = gsi_stmt (incr_gsi);
5136
5137       /* Copy the points-to information if it exists. */
5138       if (DR_PTR_INFO (dr))
5139         {
5140           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5141           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5142         }
5143       if (ptr_incr)
5144         *ptr_incr = incr;
5145
5146       return indx_before_incr;
5147     }
5148   else
5149     gcc_unreachable ();
5150 }
5151
5152
5153 /* Function bump_vector_ptr
5154
5155    Increment a pointer (to a vector type) by vector-size. If requested,
5156    i.e. if PTR-INCR is given, then also connect the new increment stmt
5157    to the existing def-use update-chain of the pointer, by modifying
5158    the PTR_INCR as illustrated below:
5159
5160    The pointer def-use update-chain before this function:
5161                         DATAREF_PTR = phi (p_0, p_2)
5162                         ....
5163         PTR_INCR:       p_2 = DATAREF_PTR + step
5164
5165    The pointer def-use update-chain after this function:
5166                         DATAREF_PTR = phi (p_0, p_2)
5167                         ....
5168                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5169                         ....
5170         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5171
5172    Input:
5173    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5174                  in the loop.
5175    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5176               the loop.  The increment amount across iterations is expected
5177               to be vector_size.
5178    BSI - location where the new update stmt is to be placed.
5179    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5180    BUMP - optional. The offset by which to bump the pointer. If not given,
5181           the offset is assumed to be vector_size.
5182
5183    Output: Return NEW_DATAREF_PTR as illustrated above.
5184
5185 */
5186
5187 tree
5188 bump_vector_ptr (vec_info *vinfo,
5189                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5190                  stmt_vec_info stmt_info, tree bump)
5191 {
5192   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5193   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5194   tree update = TYPE_SIZE_UNIT (vectype);
5195   gimple *incr_stmt;
5196   ssa_op_iter iter;
5197   use_operand_p use_p;
5198   tree new_dataref_ptr;
5199
5200   if (bump)
5201     update = bump;
5202
5203   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5204     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5205   else if (is_gimple_min_invariant (dataref_ptr))
5206     /* When possible avoid emitting a separate increment stmt that will
5207        force the addressed object addressable.  */
5208     return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5209                    fold_build2 (MEM_REF,
5210                                 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5211                                 dataref_ptr,
5212                                 fold_convert (ptr_type_node, update)));
5213   else
5214     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5215   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5216                                    dataref_ptr, update);
5217   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5218   /* Fold the increment, avoiding excessive chains use-def chains of
5219      those, leading to compile-time issues for passes until the next
5220      forwprop pass which would do this as well.  */
5221   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5222   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5223     {
5224       incr_stmt = gsi_stmt (fold_gsi);
5225       update_stmt (incr_stmt);
5226     }
5227
5228   /* Copy the points-to information if it exists. */
5229   if (DR_PTR_INFO (dr))
5230     {
5231       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5232       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5233     }
5234
5235   if (!ptr_incr)
5236     return new_dataref_ptr;
5237
5238   /* Update the vector-pointer's cross-iteration increment.  */
5239   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5240     {
5241       tree use = USE_FROM_PTR (use_p);
5242
5243       if (use == dataref_ptr)
5244         SET_USE (use_p, new_dataref_ptr);
5245       else
5246         gcc_assert (operand_equal_p (use, update, 0));
5247     }
5248
5249   return new_dataref_ptr;
5250 }
5251
5252
5253 /* Transfer the dependence clique and base recorded on the reference SRC
5254    to DEST.  Only a MEM_REF DEST and a [TARGET_]MEM_REF based SRC qualify.  */
5255
5256 void
5257 vect_copy_ref_info (tree dest, tree src)
5258 {
5259   if (TREE_CODE (dest) != MEM_REF)
5260     return;
5261
5262   tree inner = src;
5263   for (; handled_component_p (inner); inner = TREE_OPERAND (inner, 0))
5264     ;
5265   enum tree_code inner_code = TREE_CODE (inner);
5266   if (inner_code == MEM_REF || inner_code == TARGET_MEM_REF)
5267     {
5268       MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (inner);
5269       MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (inner);
5270     }
5271 }
5272
5273
5274 /* Function vect_create_destination_var.  Create a new temporary named
5275    after the SSA name SCALAR_DEST.  Its type is VECTYPE, or the type of
5276    SCALAR_DEST itself when VECTYPE is NULL.  */
5277
5278 tree
5279 vect_create_destination_var (tree scalar_dest, tree vectype)
5280 {
5281   enum vect_var_kind var_kind = vect_scalar_var;
5282   tree var_type;
5283
5284   if (vectype)
5285     {
5286       /* Mask vectors get their own kind of variable.  */
5287       var_kind = (VECTOR_BOOLEAN_TYPE_P (vectype)
5288                   ? vect_mask_var : vect_simple_var);
5289       var_type = vectype;
5290     }
5291   else
5292     var_type = TREE_TYPE (scalar_dest);
5293
5294   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5295
5296   /* The name is "<base>_<version>", or "_<version>" without a base name.  */
5297   unsigned int version = SSA_NAME_VERSION (scalar_dest);
5298   const char *base = get_name (scalar_dest);
5299   char *full_name = (base ? xasprintf ("%s_%u", base, version)
5300                      : xasprintf ("_%u", version));
5301   tree new_var = vect_get_new_vect_var (var_type, var_kind, full_name);
5302   free (full_name);
5303
5304   return new_var;
5305 }
5306
5307 /* Function vect_grouped_store_supported.
5308
5309    Return TRUE if the target supports the permutations that are needed to
5310    interleave a store group of COUNT vectors of type VECTYPE, FALSE otherwise.  */
5311
5312 bool
5313 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5314 {
5315   machine_mode mode = TYPE_MODE (vectype);
5316
5317   /* vect_permute_store_chain requires the group size to be equal to 3 or
5318      be a power of two.  */
5319   if (count != 3 && exact_log2 (count) == -1)
5320     {
5321       if (dump_enabled_p ())
5322         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5323                          "the size of the group of accesses"
5324                          " is not a power of 2 or not equal to 3\n");
5325       return false;
5326     }
5327
5328   /* Check that the permutation is supported.  */
5329   if (VECTOR_MODE_P (mode))
5330     {
5331       unsigned int i;
5332       if (count == 3)
5333         {
5334           unsigned int j0 = 0, j1 = 0, j2 = 0;
5335           unsigned int j;
5336           /* The explicit masks below need a compile-time NELT.  */
5337           unsigned int nelt;
5338           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5339             {
5340               if (dump_enabled_p ())
5341                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5342                                  "cannot handle groups of 3 stores for"
5343                                  " variable-length vectors\n");
5344               return false;
5345             }
5346           /* For each J check the two masks that vect_permute_store_chain builds.  */
5347           vec_perm_builder sel (nelt, nelt, 1);
5348           sel.quick_grow (nelt);
5349           vec_perm_indices indices;
5350           for (j = 0; j < 3; j++)
5351             {
5352               int nelt0 = ((3 - j) * nelt) % 3;
5353               int nelt1 = ((3 - j) * nelt + 1) % 3;
5354               int nelt2 = ((3 - j) * nelt + 2) % 3;
5355               for (i = 0; i < nelt; i++)
5356                 {
5357                   if (3 * i + nelt0 < nelt)
5358                     sel[3 * i + nelt0] = j0++;
5359                   if (3 * i + nelt1 < nelt)
5360                     sel[3 * i + nelt1] = nelt + j1++;
5361                   if (3 * i + nelt2 < nelt)
5362                     sel[3 * i + nelt2] = 0;
5363                 }
5364               indices.new_vector (sel, 2, nelt);
5365               if (!can_vec_perm_const_p (mode, mode, indices))
5366                 {
5367                   if (dump_enabled_p ())
5368                     dump_printf (MSG_MISSED_OPTIMIZATION,
5369                                  "permutation op not supported by target.\n");
5370                   return false;
5371                 }
5372
5373               for (i = 0; i < nelt; i++)
5374                 {
5375                   if (3 * i + nelt0 < nelt)
5376                     sel[3 * i + nelt0] = 3 * i + nelt0;
5377                   if (3 * i + nelt1 < nelt)
5378                     sel[3 * i + nelt1] = 3 * i + nelt1;
5379                   if (3 * i + nelt2 < nelt)
5380                     sel[3 * i + nelt2] = nelt + j2++;
5381                 }
5382               indices.new_vector (sel, 2, nelt);
5383               if (!can_vec_perm_const_p (mode, mode, indices))
5384                 {
5385                   if (dump_enabled_p ())
5386                     dump_printf (MSG_MISSED_OPTIMIZATION,
5387                                  "permutation op not supported by target.\n");
5388                   return false;
5389                 }
5390             }
5391           return true;
5392         }
5393       else
5394         {
5395           /* If length is not equal to 3 then only power of 2 is supported.  */
5396           gcc_assert (pow2p_hwi (count));
5397           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5398
5399           /* The encoding has 2 interleaved stepped patterns.  */
5400           if (!multiple_p (nelt, 2))
5401             return false;
5402           vec_perm_builder sel (nelt, 2, 3);
5403           sel.quick_grow (6);
5404           for (i = 0; i < 3; i++)
5405             {
5406               sel[i * 2] = i;
5407               sel[i * 2 + 1] = i + nelt;
5408             }
5409           vec_perm_indices indices (sel, 2, nelt);
5410           if (can_vec_perm_const_p (mode, mode, indices))
5411             {
5412               for (i = 0; i < 6; i++)
5413                 sel[i] += exact_div (nelt, 2);
5414               indices.new_vector (sel, 2, nelt);
5415               if (can_vec_perm_const_p (mode, mode, indices))
5416                 return true;
5417             }
5418         }
5419     }
5420
5421   if (dump_enabled_p ())
5422     dump_printf (MSG_MISSED_OPTIMIZATION,
5423                  "permutation op not supported by target.\n");
5424   return false;
5425 }
5426
5427
5428 /* Return TRUE if the target provides vec_{mask_}store_lanes for COUNT vectors
5429    of type VECTYPE.  The masked optab is the one queried when MASKED_P.  */
5430
5431 bool
5432 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5433                             bool masked_p)
5434 {
5435   if (!masked_p)
5436     return vect_lanes_optab_supported_p ("vec_store_lanes",
5437                                          vec_store_lanes_optab,
5438                                          vectype, count);
5439
5440   return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5441                                        vec_mask_store_lanes_optab,
5442                                        vectype, count);
5443 }
5444
5445
5446 /* Function vect_permute_store_chain.
5447
5448    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5449    a power of 2 or equal to 3, generate interleave_high/low stmts at GSI to
5450    reorder the data for the stores.  Return the final references for stores in
5451    RESULT_CHAIN; for a power-of-2 LENGTH each stage also rewrites DR_CHAIN.
5452
5453    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5454    The input is 4 vectors each containing 8 elements.  We assign a number to
5455    each element, the input sequence is:
5456
5457    1st vec:   0  1  2  3  4  5  6  7
5458    2nd vec:   8  9 10 11 12 13 14 15
5459    3rd vec:  16 17 18 19 20 21 22 23
5460    4th vec:  24 25 26 27 28 29 30 31
5461
5462    The output sequence should be:
5463
5464    1st vec:  0  8 16 24  1  9 17 25
5465    2nd vec:  2 10 18 26  3 11 19 27
5466    3rd vec:  4 12 20 28  5 13 21 29
5467    4th vec:  6 14 22 30  7 15 23 31
5468
5469    i.e., we interleave the contents of the four vectors in their order.
5470
5471    We use interleave_high/low instructions to create such output.  The input of
5472    each interleave_high/low operation is two vectors:
5473    1st vec    2nd vec
5474    0 1 2 3    4 5 6 7
5475    the even elements of the result vector are obtained left-to-right from the
5476    high/low elements of the first vector.  The odd elements of the result are
5477    obtained left-to-right from the high/low elements of the second vector.
5478    The output of interleave_high will be:   0 4 1 5
5479    and of interleave_low:                   2 6 3 7
5480
5481
5482    The permutation is done in log LENGTH stages.  In each stage interleave_high
5483    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5484    where the first argument is taken from the first half of DR_CHAIN and the
5485    second argument from its second half.
5486    In our example,
5487
5488    I1: interleave_high (1st vec, 3rd vec)
5489    I2: interleave_low (1st vec, 3rd vec)
5490    I3: interleave_high (2nd vec, 4th vec)
5491    I4: interleave_low (2nd vec, 4th vec)
5492
5493    The output for the first stage is:
5494
5495    I1:  0 16  1 17  2 18  3 19
5496    I2:  4 20  5 21  6 22  7 23
5497    I3:  8 24  9 25 10 26 11 27
5498    I4: 12 28 13 29 14 30 15 31
5499
5500    The output of the second stage, i.e. the final result is:
5501
5502    I1:  0  8 16 24  1  9 17 25
5503    I2:  2 10 18 26  3 11 19 27
5504    I3:  4 12 20 28  5 13 21 29
5505    I4:  6 14 22 30  7 15 23 31.  */
5506
5507 void
5508 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5509                           unsigned int length,
5510                           stmt_vec_info stmt_info,
5511                           gimple_stmt_iterator *gsi,
5512                           vec<tree> *result_chain)
5513 {
5514   tree vect1, vect2, high, low;
5515   gimple *perm_stmt;
5516   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5517   tree perm_mask_low, perm_mask_high;
5518   tree data_ref;
5519   tree perm3_mask_low, perm3_mask_high;
5520   unsigned int i, j, n, log_length = exact_log2 (length);
5521   /* Seed RESULT_CHAIN with a copy of the first LENGTH entries of DR_CHAIN.  */
5522   result_chain->quick_grow (length);
5523   memcpy (result_chain->address (), dr_chain.address (),
5524           length * sizeof (tree));
5525
5526   if (length == 3)
5527     {
5528       /* vect_grouped_store_supported ensures that this is constant.  */
5529       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5530       unsigned int j0 = 0, j1 = 0, j2 = 0;
5531
5532       vec_perm_builder sel (nelt, nelt, 1);
5533       sel.quick_grow (nelt);
5534       vec_perm_indices indices;
5535       for (j = 0; j < 3; j++)
5536         {
5537           int nelt0 = ((3 - j) * nelt) % 3;
5538           int nelt1 = ((3 - j) * nelt + 1) % 3;
5539           int nelt2 = ((3 - j) * nelt + 2) % 3;
5540
5541           for (i = 0; i < nelt; i++)
5542             {
5543               if (3 * i + nelt0 < nelt)
5544                 sel[3 * i + nelt0] = j0++;
5545               if (3 * i + nelt1 < nelt)
5546                 sel[3 * i + nelt1] = nelt + j1++;
5547               if (3 * i + nelt2 < nelt)
5548                 sel[3 * i + nelt2] = 0;
5549             }
5550           indices.new_vector (sel, 2, nelt);
5551           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5552
5553           for (i = 0; i < nelt; i++)
5554             {
5555               if (3 * i + nelt0 < nelt)
5556                 sel[3 * i + nelt0] = 3 * i + nelt0;
5557               if (3 * i + nelt1 < nelt)
5558                 sel[3 * i + nelt1] = 3 * i + nelt1;
5559               if (3 * i + nelt2 < nelt)
5560                 sel[3 * i + nelt2] = nelt + j2++;
5561             }
5562           indices.new_vector (sel, 2, nelt);
5563           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5564
5565           vect1 = dr_chain[0];
5566           vect2 = dr_chain[1];
5567
5568           /* Create interleaving stmt:
5569              low = VEC_PERM_EXPR <vect1, vect2,
5570                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5571                                    j + 2, nelt + j + 2, *, ...}>  */
5572           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5573           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5574                                            vect2, perm3_mask_low);
5575           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5576
5577           vect1 = data_ref;
5578           vect2 = dr_chain[2];
5579           /* Create interleaving stmt:
5580              high = VEC_PERM_EXPR <vect1, vect2,
5581                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
5582                                     6, 7, nelt + j + 2, ...}>  */
5583           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5584           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5585                                            vect2, perm3_mask_high);
5586           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5587           (*result_chain)[j] = data_ref;
5588         }
5589     }
5590   else
5591     {
5592       /* If length is not equal to 3 then only power of 2 is supported.  */
5593       gcc_assert (pow2p_hwi (length));
5594
5595       /* The encoding has 2 interleaved stepped patterns.  */
5596       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5597       vec_perm_builder sel (nelt, 2, 3);
5598       sel.quick_grow (6);
5599       for (i = 0; i < 3; i++)
5600         {
5601           sel[i * 2] = i;
5602           sel[i * 2 + 1] = i + nelt;
5603         }
5604         vec_perm_indices indices (sel, 2, nelt);
5605         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5606
5607         for (i = 0; i < 6; i++)
5608           sel[i] += exact_div (nelt, 2);
5609         indices.new_vector (sel, 2, nelt);
5610         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5611
5612         for (i = 0, n = log_length; i < n; i++)
5613           {
5614             for (j = 0; j < length/2; j++)
5615               {
5616                 vect1 = dr_chain[j];
5617                 vect2 = dr_chain[j+length/2];
5618
5619                 /* Create interleaving stmt:
5620                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5621                                                         ...}>  */
5622                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5623                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5624                                                  vect2, perm_mask_high);
5625                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5626                 (*result_chain)[2*j] = high;
5627
5628                 /* Create interleaving stmt:
5629                    low = VEC_PERM_EXPR <vect1, vect2,
5630                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5631                                          ...}>  */
5632                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5633                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5634                                                  vect2, perm_mask_low);
5635                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5636                 (*result_chain)[2*j+1] = low;
5637               }
5638             memcpy (dr_chain.address (), result_chain->address (),
5639                     length * sizeof (tree));
5640           }
5641     }
5642 }
5643
5644 /* Function vect_setup_realignment
5645
5646    This function is called when vectorizing an unaligned load using
5647    the dr_explicit_realign[_optimized] scheme.
5648    This function generates the following code at the loop prolog:
5649
5650       p = initial_addr;
5651    x  msq_init = *(floor(p));   # prolog load
5652       realignment_token = call target_builtin;
5653     loop:
5654    x  msq = phi (msq_init, ---)
5655
5656    The stmts marked with x are generated only for the case of
5657    dr_explicit_realign_optimized.
5658
5659    The code above sets up a new (vector) pointer, pointing to the first
5660    location accessed by STMT_INFO, and a "floor-aligned" load using that
5661    pointer.  It also generates code to compute the "realignment-token"
5662    (if the relevant target hook was defined), and creates a phi-node at the
5663    loop-header bb whose arguments are the result of the prolog-load (created
5664    by this function) and the result of a load that takes place in the loop
5665    (to be created by the caller to this function).
5666
5667    For the case of dr_explicit_realign_optimized:
5668    The caller to this function uses the phi-result (msq) to create the
5669    realignment code inside the loop, and sets up the missing phi argument,
5670    as follows:
5671     loop:
5672       msq = phi (msq_init, lsq)
5673       lsq = *(floor(p'));        # load in loop
5674       result = realign_load (msq, lsq, realignment_token);
5675
5676    For the case of dr_explicit_realign:
5677     loop:
5678       msq = *(floor(p));        # load in loop
5679       p' = p + (VS-1);
5680       lsq = *(floor(p'));       # load in loop
5681       result = realign_load (msq, lsq, realignment_token);
5682
5683    Input:
5684    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5685                a memory location that may be unaligned.
5686    GSI - place where new code is to be inserted.
5687    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes is used.
5688    INIT_ADDR - if not NULL_TREE, the address computed by the caller inside the loop.
5689
5690    Output:
5691    REALIGNMENT_TOKEN - result of the builtin_mask_for_load target hook call, if defined.
5692    AT_LOOP - if not NULL, set to the loop chosen for the initial vector load (step 2).
5693    Return value - the loop-header phi result, or NULL_TREE for dr_explicit_realign.  */
5694
5695 tree
5696 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5697                         gimple_stmt_iterator *gsi, tree *realignment_token,
5698                         enum dr_alignment_support alignment_support_scheme,
5699                         tree init_addr,
5700                         class loop **at_loop)
5701 {
5702   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5703   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5704   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5705   struct data_reference *dr = dr_info->dr;
5706   class loop *loop = NULL;
5707   edge pe = NULL;
5708   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5709   tree vec_dest;
5710   gimple *inc;
5711   tree ptr;
5712   tree data_ref;
5713   basic_block new_bb;
5714   tree msq_init = NULL_TREE;
5715   tree new_temp;
5716   gphi *phi_stmt;
5717   tree msq = NULL_TREE;
5718   gimple_seq stmts = NULL;
5719   bool compute_in_loop = false;
5720   bool nested_in_vect_loop = false;
5721   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5722   class loop *loop_for_initial_load = NULL;
5723
5724   if (loop_vinfo)
5725     {
5726       loop = LOOP_VINFO_LOOP (loop_vinfo);
5727       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5728     }
5729
5730   gcc_assert (alignment_support_scheme == dr_explicit_realign
5731               || alignment_support_scheme == dr_explicit_realign_optimized);
5732
5733   /* We need to generate three things:
5734      1. the misalignment computation
5735      2. the extra vector load (for the optimized realignment scheme).
5736      3. the phi node for the two vectors from which the realignment is
5737       done (for the optimized realignment scheme).  */
5738
5739   /* 1. Determine where to generate the misalignment computation.
5740
5741      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5742      calculation will be generated by this function, outside the loop (in the
5743      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5744      caller, inside the loop.
5745
5746      Background: If the misalignment remains fixed throughout the iterations of
5747      the loop, then both realignment schemes are applicable, and also the
5748      misalignment computation can be done outside LOOP.  This is because we are
5749      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5750      are a multiple of VS (the Vector Size), and therefore the misalignment in
5751      different vectorized LOOP iterations is always the same.
5752      The problem arises only if the memory access is in an inner-loop nested
5753      inside LOOP, which is now being vectorized using outer-loop vectorization.
5754      This is the only case when the misalignment of the memory access may not
5755      remain fixed throughout the iterations of the inner-loop (as explained in
5756      detail in vect_supportable_dr_alignment).  In this case, not only is the
5757      optimized realignment scheme not applicable, but also the misalignment
5758      computation (and generation of the realignment token that is passed to
5759      REALIGN_LOAD) have to be done inside the loop.
5760
5761      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5762      or not, which in turn determines if the misalignment is computed inside
5763      the inner-loop, or outside LOOP.  */
5764
5765   if (init_addr != NULL_TREE || !loop_vinfo)
5766     {
5767       compute_in_loop = true;
5768       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5769     }
5770
5771
5772   /* 2. Determine where to generate the extra vector load.
5773
5774      For the optimized realignment scheme, instead of generating two vector
5775      loads in each iteration, we generate a single extra vector load in the
5776      preheader of the loop, and in each iteration reuse the result of the
5777      vector load from the previous iteration.  In case the memory access is in
5778      an inner-loop nested inside LOOP, which is now being vectorized using
5779      outer-loop vectorization, we need to determine whether this initial vector
5780      load should be generated at the preheader of the inner-loop, or can be
5781      generated at the preheader of LOOP.  If the memory access has no evolution
5782      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5783      to be generated inside LOOP (in the preheader of the inner-loop).  */
5784
5785   if (nested_in_vect_loop)
5786     {
5787       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5788       bool invariant_in_outerloop =
5789             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5790       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5791     }
5792   else
5793     loop_for_initial_load = loop;
5794   if (at_loop)
5795     *at_loop = loop_for_initial_load;
5796   /* Pick the virtual operand for the initial load: the preheader value of the virtual PHI, else the VUSE at GSI.  */
5797   tree vuse = NULL_TREE;
5798   if (loop_for_initial_load)
5799     {
5800       pe = loop_preheader_edge (loop_for_initial_load);
5801       if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5802         vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5803     }
5804   if (!vuse)
5805     vuse = gimple_vuse (gsi_stmt (*gsi));
5806
5807   /* 3. For the case of the optimized realignment, create the first vector
5808       load at the loop preheader.  */
5809
5810   if (alignment_support_scheme == dr_explicit_realign_optimized)
5811     {
5812       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5813       gassign *new_stmt;
5814
5815       gcc_assert (!compute_in_loop);
5816       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5817       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5818                                       loop_for_initial_load, NULL_TREE,
5819                                       &init_addr, NULL, &inc, true);
5820       if (TREE_CODE (ptr) == SSA_NAME)
5821         new_temp = copy_ssa_name (ptr);
5822       else
5823         new_temp = make_ssa_name (TREE_TYPE (ptr));
5824       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5825       tree type = TREE_TYPE (ptr);
5826       new_stmt = gimple_build_assign
5827                    (new_temp, BIT_AND_EXPR, ptr,
5828                     fold_build2 (MINUS_EXPR, type,
5829                                  build_int_cst (type, 0),
5830                                  build_int_cst (type, align)));
5831       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5832       gcc_assert (!new_bb);
5833       data_ref
5834         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5835                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5836       vect_copy_ref_info (data_ref, DR_REF (dr));
5837       new_stmt = gimple_build_assign (vec_dest, data_ref);
5838       new_temp = make_ssa_name (vec_dest, new_stmt);
5839       gimple_assign_set_lhs (new_stmt, new_temp);
5840       gimple_set_vuse (new_stmt, vuse);
5841       if (pe)
5842         {
5843           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5844           gcc_assert (!new_bb);
5845         }
5846       else
5847          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5848       /* NOTE(review): PE was already used unguarded for the BIT_AND_EXPR stmt; the !PE arm above looks dead - confirm.  */
5849       msq_init = gimple_assign_lhs (new_stmt);
5850     }
5851
5852   /* 4. Create realignment token using a target builtin, if available.
5853       It is done either inside the containing loop, or before LOOP (as
5854       determined above).  */
5855
5856   if (targetm.vectorize.builtin_mask_for_load)
5857     {
5858       gcall *new_stmt;
5859       tree builtin_decl;
5860
5861       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5862       if (!init_addr)
5863         {
5864           /* Generate the INIT_ADDR computation outside LOOP.  */
5865           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5866                                                             stmt_info, &stmts,
5867                                                             NULL_TREE);
5868           if (loop)
5869             {
5870               pe = loop_preheader_edge (loop);
5871               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5872               gcc_assert (!new_bb);
5873             }
5874           else
5875              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5876         }
5877
5878       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5879       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5880       vec_dest =
5881         vect_create_destination_var (scalar_dest,
5882                                      gimple_call_return_type (new_stmt));
5883       new_temp = make_ssa_name (vec_dest, new_stmt);
5884       gimple_call_set_lhs (new_stmt, new_temp);
5885
5886       if (compute_in_loop)
5887         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5888       else
5889         {
5890           /* Generate the misalignment computation outside LOOP.  */
5891           pe = loop_preheader_edge (loop);
5892           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5893           gcc_assert (!new_bb);
5894         }
5895
5896       *realignment_token = gimple_call_lhs (new_stmt);
5897
5898       /* The result of the CALL_EXPR to this builtin is determined from
5899          the value of the parameter and no global variables are touched
5900          which makes the builtin a "const" function.  Requiring the
5901          builtin to have the "const" attribute makes it unnecessary
5902          to call mark_call_clobbered.  */
5903       gcc_assert (TREE_READONLY (builtin_decl));
5904     }
5905   /* For dr_explicit_realign no phi is created; MSQ is still NULL_TREE here.  */
5906   if (alignment_support_scheme == dr_explicit_realign)
5907     return msq;
5908
5909   gcc_assert (!compute_in_loop);
5910   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5911
5912
5913   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5914
5915   pe = loop_preheader_edge (containing_loop);
5916   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5917   msq = make_ssa_name (vec_dest);
5918   phi_stmt = create_phi_node (msq, containing_loop->header);
5919   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5920
5921   return msq;
5922 }
5923
5924
5925 /* Function vect_grouped_load_supported.
5926
5927    Tell whether the target can de-interleave a load group of COUNT elements
5928    (the number of statements plus the number of gaps) of vector type VECTYPE.
5929    SINGLE_ELEMENT_P is true if the group is really one statement followed by
5930    a gap of COUNT - 1.  Returns true if a suitable permute exists; otherwise
5931    false is returned and, when dumping is enabled, the reason is dumped.  */
5932
5933 bool
5934 vect_grouped_load_supported (tree vectype, bool single_element_p,
5935                              unsigned HOST_WIDE_INT count)
5936 {
5937   machine_mode vmode = TYPE_MODE (vectype);
5938
5939   /* Punt on single-element interleaving whose element distance leaves
5940      unused vector loads around - we would at least create very
5941      sub-optimal code in that case (and blow up memory, see
5942      PR65518).  */
5943   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5944     {
5945       if (dump_enabled_p ())
5946         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5947                          "single-element interleaving not supported "
5948                          "for not adjacent vector loads\n");
5949       return false;
5950     }
5951
5952   /* Only groups of 3 or of a power-of-two size can be handled by
5953      vect_permute_load_chain.  */
5954   if (exact_log2 (count) == -1 && count != 3)
5955     {
5956       if (dump_enabled_p ())
5957         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5958                          "the size of the group of accesses"
5959                          " is not a power of 2 or not equal to 3\n");
5960       return false;
5961     }
5962
5963   /* Check that the permutation is supported.  */
5964   if (VECTOR_MODE_P (vmode) && count == 3)
5965     {
5966       unsigned int nunits;
5967       if (!GET_MODE_NUNITS (vmode).is_constant (&nunits))
5968         {
5969           if (dump_enabled_p ())
5970             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5971                              "cannot handle groups of 3 loads for"
5972                              " variable-length vectors\n");
5973           return false;
5974         }
5975
5976       vec_perm_builder mask (nunits, nunits, 1);
5977       mask.quick_grow (nunits);
5978       vec_perm_indices perm;
5979       for (unsigned int k = 0; k < 3; k++)
5980         {
5981           /* Mask 1: every third element starting at K; 0 pads the tail.  */
5982           for (unsigned int i = 0; i < nunits; i++)
5983             mask[i] = (3 * i + k < 2 * nunits) ? 3 * i + k : 0;
5984           perm.new_vector (mask, 2, nunits);
5985           if (!can_vec_perm_const_p (vmode, vmode, perm))
5986             {
5987               if (dump_enabled_p ())
5988                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5989                                  "shuffle of 3 loads is not supported by"
5990                                  " target\n");
5991               return false;
5992             }
5993
5994           /* Mask 2: keep those lanes and fill the tail from the other input.  */
5995           unsigned int filled = 0;
5996           for (unsigned int i = 0; i < nunits; i++)
5997             {
5998               if (3 * i + k >= 2 * nunits)
5999                 mask[i] = nunits + ((nunits + k) % 3) + 3 * (filled++);
6000               else
6001                 mask[i] = i;
6002             }
6003           perm.new_vector (mask, 2, nunits);
6004           if (!can_vec_perm_const_p (vmode, vmode, perm))
6005             {
6006               if (dump_enabled_p ())
6007                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6008                                  "shuffle of 3 loads is not supported by"
6009                                  " target\n");
6010               return false;
6011             }
6012         }
6013       return true;
6014     }
6015   else if (VECTOR_MODE_P (vmode))
6016     {
6017       /* If length is not equal to 3 then only power of 2 is supported.  */
6018       gcc_assert (pow2p_hwi (count));
6019       poly_uint64 nunits = GET_MODE_NUNITS (vmode);
6020
6021       /* The encoding has a single stepped pattern.  */
6022       vec_perm_builder mask (nunits, 1, 3);
6023       mask.quick_grow (3);
6024       for (unsigned int i = 0; i < 3; i++)
6025         mask[i] = i * 2;
6026       vec_perm_indices perm (mask, 2, nunits);
6027       bool supported = can_vec_perm_const_p (vmode, vmode, perm);
6028       if (supported)
6029         {
6030           for (unsigned int i = 0; i < 3; i++)
6031             mask[i] = i * 2 + 1;
6032           perm.new_vector (mask, 2, nunits);
6033           supported = can_vec_perm_const_p (vmode, vmode, perm);
6034         }
6035       if (supported)
6036         return true;
6037     }
6038
6039   if (dump_enabled_p ())
6040     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6041                      "extract even/odd not supported by target\n");
6042   return false;
6043 }
6044
6045 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6046    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
6047
6048 bool
6049 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6050                            bool masked_p)
6051 {
6052   if (masked_p)
6053     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6054                                          vec_mask_load_lanes_optab,
6055                                          vectype, count);
6056   else
6057     return vect_lanes_optab_supported_p ("vec_load_lanes",
6058                                          vec_load_lanes_optab,
6059                                          vectype, count);
6060 }
6061
6062 /* Function vect_permute_load_chain.
6063
6064    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6065    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6066    the input data correctly.  Return the final references for loads in
6067    RESULT_CHAIN.
6068
6069    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6070    The input is 4 vectors each containing 8 elements. We assign a number to each
6071    element, the input sequence is:
6072
6073    1st vec:   0  1  2  3  4  5  6  7
6074    2nd vec:   8  9 10 11 12 13 14 15
6075    3rd vec:  16 17 18 19 20 21 22 23
6076    4th vec:  24 25 26 27 28 29 30 31
6077
6078    The output sequence should be:
6079
6080    1st vec:  0 4  8 12 16 20 24 28
6081    2nd vec:  1 5  9 13 17 21 25 29
6082    3rd vec:  2 6 10 14 18 22 26 30
6083    4th vec:  3 7 11 15 19 23 27 31
6084
6085    i.e., the first output vector should contain the first elements of each
6086    interleaving group, etc.
6087
6088    We use extract_even/odd instructions to create such output.  The input of
6089    each extract_even/odd operation is two vectors
6090    1st vec    2nd vec
6091    0 1 2 3    4 5 6 7
6092
6093    and the output is the vector of extracted even/odd elements.  The output of
6094    extract_even will be:   0 2 4 6
6095    and of extract_odd:     1 3 5 7
6096
6097
6098    The permutation is done in log LENGTH stages.  In each stage extract_even
6099    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6100    their order.  In our example,
6101
6102    E1: extract_even (1st vec, 2nd vec)
6103    E2: extract_odd (1st vec, 2nd vec)
6104    E3: extract_even (3rd vec, 4th vec)
6105    E4: extract_odd (3rd vec, 4th vec)
6106
6107    The output for the first stage will be:
6108
6109    E1:  0  2  4  6  8 10 12 14
6110    E2:  1  3  5  7  9 11 13 15
6111    E3: 16 18 20 22 24 26 28 30
6112    E4: 17 19 21 23 25 27 29 31
6113
6114    In order to proceed and create the correct sequence for the next stage (or
6115    for the correct output, if the second stage is the last one, as in our
6116    example), we first put the output of extract_even operation and then the
6117    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6118    The input for the second stage is:
6119
6120    1st vec (E1):  0  2  4  6  8 10 12 14
6121    2nd vec (E3): 16 18 20 22 24 26 28 30
6122    3rd vec (E2):  1  3  5  7  9 11 13 15
6123    4th vec (E4): 17 19 21 23 25 27 29 31
6124
6125    The output of the second stage:
6126
6127    E1: 0 4  8 12 16 20 24 28
6128    E2: 2 6 10 14 18 22 26 30
6129    E3: 1 5  9 13 17 21 25 29
6130    E4: 3 7 11 15 19 23 27 31
6131
6132    And RESULT_CHAIN after reordering:
6133
6134    1st vec (E1):  0 4  8 12 16 20 24 28
6135    2nd vec (E3):  1 5  9 13 17 21 25 29
6136    3rd vec (E2):  2 6 10 14 18 22 26 30
6137    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6138
6139 static void
6140 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6141                          unsigned int length,
6142                          stmt_vec_info stmt_info,
6143                          gimple_stmt_iterator *gsi,
6144                          vec<tree> *result_chain)
6145 {
6146   tree data_ref, first_vect, second_vect;
6147   tree perm_mask_even, perm_mask_odd;
6148   tree perm3_mask_low, perm3_mask_high;
6149   gimple *perm_stmt;
6150   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6151   unsigned int i, j, log_length = exact_log2 (length);
6152
6153   result_chain->quick_grow (length);
6154   memcpy (result_chain->address (), dr_chain.address (),
6155           length * sizeof (tree));
6156
6157   if (length == 3)
6158     {
6159       /* vect_grouped_load_supported ensures that this is constant.  */
6160       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6161       unsigned int k;
6162
6163       vec_perm_builder sel (nelt, nelt, 1);
6164       sel.quick_grow (nelt);
6165       vec_perm_indices indices;
6166       for (k = 0; k < 3; k++)
6167         {
6168           for (i = 0; i < nelt; i++)
6169             if (3 * i + k < 2 * nelt)
6170               sel[i] = 3 * i + k;
6171             else
6172               sel[i] = 0;
6173           indices.new_vector (sel, 2, nelt);
6174           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6175
6176           for (i = 0, j = 0; i < nelt; i++)
6177             if (3 * i + k < 2 * nelt)
6178               sel[i] = i;
6179             else
6180               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6181           indices.new_vector (sel, 2, nelt);
6182           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6183
6184           first_vect = dr_chain[0];
6185           second_vect = dr_chain[1];
6186
6187           /* Create interleaving stmt (low part of):
6188              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6189                                                              ...}>  */
6190           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6191           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6192                                            second_vect, perm3_mask_low);
6193           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6194
6195           /* Create interleaving stmt (high part of):
6196              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6197                                                               ...}>  */
6198           first_vect = data_ref;
6199           second_vect = dr_chain[2];
6200           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6201           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6202                                            second_vect, perm3_mask_high);
6203           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6204           (*result_chain)[k] = data_ref;
6205         }
6206     }
6207   else
6208     {
6209       /* If length is not equal to 3 then only power of 2 is supported.  */
6210       gcc_assert (pow2p_hwi (length));
6211
6212       /* The encoding has a single stepped pattern.  */
6213       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6214       vec_perm_builder sel (nelt, 1, 3);
6215       sel.quick_grow (3);
6216       for (i = 0; i < 3; ++i)
6217         sel[i] = i * 2;
6218       vec_perm_indices indices (sel, 2, nelt);
6219       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6220
6221       for (i = 0; i < 3; ++i)
6222         sel[i] = i * 2 + 1;
6223       indices.new_vector (sel, 2, nelt);
6224       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6225
6226       for (i = 0; i < log_length; i++)
6227         {
6228           for (j = 0; j < length; j += 2)
6229             {
6230               first_vect = dr_chain[j];
6231               second_vect = dr_chain[j+1];
6232
6233               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6234               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6235               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6236                                                first_vect, second_vect,
6237                                                perm_mask_even);
6238               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6239               (*result_chain)[j/2] = data_ref;
6240
6241               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6242               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6243               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6244                                                first_vect, second_vect,
6245                                                perm_mask_odd);
6246               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6247               (*result_chain)[j/2+length/2] = data_ref;
6248             }
6249           memcpy (dr_chain.address (), result_chain->address (),
6250                   length * sizeof (tree));
6251         }
6252     }
6253 }
6254
6255 /* Function vect_shift_permute_load_chain.
6256
6257    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6258    sequence of stmts to reorder the input data accordingly.
6259    Return the final references for loads in RESULT_CHAIN.
6260    Return true if successed, false otherwise.
6261
6262    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6263    The input is 3 vectors each containing 8 elements.  We assign a
6264    number to each element, the input sequence is:
6265
6266    1st vec:   0  1  2  3  4  5  6  7
6267    2nd vec:   8  9 10 11 12 13 14 15
6268    3rd vec:  16 17 18 19 20 21 22 23
6269
6270    The output sequence should be:
6271
6272    1st vec:  0 3 6  9 12 15 18 21
6273    2nd vec:  1 4 7 10 13 16 19 22
6274    3rd vec:  2 5 8 11 14 17 20 23
6275
6276    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6277
6278    First we shuffle all 3 vectors to get correct elements order:
6279
6280    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6281    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6282    3rd vec:  (16 19 22) (17 20 23) (18 21)
6283
6284    Next we unite and shift vector 3 times:
6285
6286    1st step:
6287      shift right by 6 the concatenation of:
6288      "1st vec" and  "2nd vec"
6289        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6290      "2nd vec" and  "3rd vec"
6291        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6292      "3rd vec" and  "1st vec"
6293        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6294                              | New vectors                   |
6295
6296      So that now new vectors are:
6297
6298      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6299      2nd vec:  (10 13) (16 19 22) (17 20 23)
6300      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6301
6302    2nd step:
6303      shift right by 5 the concatenation of:
6304      "1st vec" and  "3rd vec"
6305        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6306      "2nd vec" and  "1st vec"
6307        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6308      "3rd vec" and  "2nd vec"
6309        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6310                           | New vectors                   |
6311
6312      So that now new vectors are:
6313
6314      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6315      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6316      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6317
6318    3rd step:
6319      shift right by 5 the concatenation of:
6320      "1st vec" and  "1st vec"
6321        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6322      shift right by 3 the concatenation of:
6323      "2nd vec" and  "2nd vec"
6324                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6325                           | New vectors                   |
6326
6327      So that now all vectors are READY:
6328      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6329      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6330      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6331
6332    This algorithm is faster than one in vect_permute_load_chain if:
6333      1.  "shift of a concatination" is faster than general permutation.
6334          This is usually so.
6335      2.  The TARGET machine can't execute vector instructions in parallel.
6336          This is because each step of the algorithm depends on previous.
6337          The algorithm in vect_permute_load_chain is much more parallel.
6338
6339    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6340 */
6341
6342 static bool
6343 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6344                                unsigned int length,
6345                                stmt_vec_info stmt_info,
6346                                gimple_stmt_iterator *gsi,
6347                                vec<tree> *result_chain)
6348 {
6349   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6350   tree perm2_mask1, perm2_mask2, perm3_mask;
6351   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6352   gimple *perm_stmt;
6353
6354   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6355   machine_mode vmode = TYPE_MODE (vectype);
6356   unsigned int i;
6357   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6358
6359   unsigned HOST_WIDE_INT nelt, vf;
6360   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6361       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6362     /* Not supported for variable-length vectors.  */
6363     return false;
6364
6365   vec_perm_builder sel (nelt, nelt, 1);
6366   sel.quick_grow (nelt);
6367
6368   result_chain->quick_grow (length);
6369   memcpy (result_chain->address (), dr_chain.address (),
6370           length * sizeof (tree));
6371
6372   if (pow2p_hwi (length) && vf > 4)
6373     {
6374       unsigned int j, log_length = exact_log2 (length);
6375       for (i = 0; i < nelt / 2; ++i)
6376         sel[i] = i * 2;
6377       for (i = 0; i < nelt / 2; ++i)
6378         sel[nelt / 2 + i] = i * 2 + 1;
6379       vec_perm_indices indices (sel, 2, nelt);
6380       if (!can_vec_perm_const_p (vmode, vmode, indices))
6381         {
6382           if (dump_enabled_p ())
6383             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6384                              "shuffle of 2 fields structure is not \
6385                               supported by target\n");
6386           return false;
6387         }
6388       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6389
6390       for (i = 0; i < nelt / 2; ++i)
6391         sel[i] = i * 2 + 1;
6392       for (i = 0; i < nelt / 2; ++i)
6393         sel[nelt / 2 + i] = i * 2;
6394       indices.new_vector (sel, 2, nelt);
6395       if (!can_vec_perm_const_p (vmode, vmode, indices))
6396         {
6397           if (dump_enabled_p ())
6398             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6399                              "shuffle of 2 fields structure is not \
6400                               supported by target\n");
6401           return false;
6402         }
6403       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6404
6405       /* Generating permutation constant to shift all elements.
6406          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6407       for (i = 0; i < nelt; i++)
6408         sel[i] = nelt / 2 + i;
6409       indices.new_vector (sel, 2, nelt);
6410       if (!can_vec_perm_const_p (vmode, vmode, indices))
6411         {
6412           if (dump_enabled_p ())
6413             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6414                              "shift permutation is not supported by target\n");
6415           return false;
6416         }
6417       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6418
6419       /* Generating permutation constant to select vector from 2.
6420          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6421       for (i = 0; i < nelt / 2; i++)
6422         sel[i] = i;
6423       for (i = nelt / 2; i < nelt; i++)
6424         sel[i] = nelt + i;
6425       indices.new_vector (sel, 2, nelt);
6426       if (!can_vec_perm_const_p (vmode, vmode, indices))
6427         {
6428           if (dump_enabled_p ())
6429             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6430                              "select is not supported by target\n");
6431           return false;
6432         }
6433       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6434
6435       for (i = 0; i < log_length; i++)
6436         {
6437           for (j = 0; j < length; j += 2)
6438             {
6439               first_vect = dr_chain[j];
6440               second_vect = dr_chain[j + 1];
6441
6442               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6443               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6444                                                first_vect, first_vect,
6445                                                perm2_mask1);
6446               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6447               vect[0] = data_ref;
6448
6449               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6450               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6451                                                second_vect, second_vect,
6452                                                perm2_mask2);
6453               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6454               vect[1] = data_ref;
6455
6456               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6457               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6458                                                vect[0], vect[1], shift1_mask);
6459               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6460               (*result_chain)[j/2 + length/2] = data_ref;
6461
6462               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6463               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6464                                                vect[0], vect[1], select_mask);
6465               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6466               (*result_chain)[j/2] = data_ref;
6467             }
6468           memcpy (dr_chain.address (), result_chain->address (),
6469                   length * sizeof (tree));
6470         }
6471       return true;
6472     }
6473   if (length == 3 && vf > 2)
6474     {
6475       unsigned int k = 0, l = 0;
6476
6477       /* Generating permutation constant to get all elements in rigth order.
6478          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6479       for (i = 0; i < nelt; i++)
6480         {
6481           if (3 * k + (l % 3) >= nelt)
6482             {
6483               k = 0;
6484               l += (3 - (nelt % 3));
6485             }
6486           sel[i] = 3 * k + (l % 3);
6487           k++;
6488         }
6489       vec_perm_indices indices (sel, 2, nelt);
6490       if (!can_vec_perm_const_p (vmode, vmode, indices))
6491         {
6492           if (dump_enabled_p ())
6493             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6494                              "shuffle of 3 fields structure is not \
6495                               supported by target\n");
6496           return false;
6497         }
6498       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6499
6500       /* Generating permutation constant to shift all elements.
6501          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6502       for (i = 0; i < nelt; i++)
6503         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6504       indices.new_vector (sel, 2, nelt);
6505       if (!can_vec_perm_const_p (vmode, vmode, indices))
6506         {
6507           if (dump_enabled_p ())
6508             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6509                              "shift permutation is not supported by target\n");
6510           return false;
6511         }
6512       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6513
6514       /* Generating permutation constant to shift all elements.
6515          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6516       for (i = 0; i < nelt; i++)
6517         sel[i] = 2 * (nelt / 3) + 1 + i;
6518       indices.new_vector (sel, 2, nelt);
6519       if (!can_vec_perm_const_p (vmode, vmode, indices))
6520         {
6521           if (dump_enabled_p ())
6522             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6523                              "shift permutation is not supported by target\n");
6524           return false;
6525         }
6526       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6527
6528       /* Generating permutation constant to shift all elements.
6529          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6530       for (i = 0; i < nelt; i++)
6531         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6532       indices.new_vector (sel, 2, nelt);
6533       if (!can_vec_perm_const_p (vmode, vmode, indices))
6534         {
6535           if (dump_enabled_p ())
6536             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6537                              "shift permutation is not supported by target\n");
6538           return false;
6539         }
6540       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6541
6542       /* Generating permutation constant to shift all elements.
6543          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6544       for (i = 0; i < nelt; i++)
6545         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6546       indices.new_vector (sel, 2, nelt);
6547       if (!can_vec_perm_const_p (vmode, vmode, indices))
6548         {
6549           if (dump_enabled_p ())
6550             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551                              "shift permutation is not supported by target\n");
6552           return false;
6553         }
6554       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6555
6556       for (k = 0; k < 3; k++)
6557         {
6558           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6559           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6560                                            dr_chain[k], dr_chain[k],
6561                                            perm3_mask);
6562           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6563           vect[k] = data_ref;
6564         }
6565
6566       for (k = 0; k < 3; k++)
6567         {
6568           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6569           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6570                                            vect[k % 3], vect[(k + 1) % 3],
6571                                            shift1_mask);
6572           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6573           vect_shift[k] = data_ref;
6574         }
6575
6576       for (k = 0; k < 3; k++)
6577         {
6578           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6579           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6580                                            vect_shift[(4 - k) % 3],
6581                                            vect_shift[(3 - k) % 3],
6582                                            shift2_mask);
6583           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6584           vect[k] = data_ref;
6585         }
6586
6587       (*result_chain)[3 - (nelt % 3)] = vect[2];
6588
6589       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6590       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6591                                        vect[0], shift3_mask);
6592       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6593       (*result_chain)[nelt % 3] = data_ref;
6594
6595       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6596       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6597                                        vect[1], shift4_mask);
6598       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6599       (*result_chain)[0] = data_ref;
6600       return true;
6601     }
6602   return false;
6603 }
6604
6605 /* Function vect_transform_grouped_load.
6606
6607    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6608    to perform their permutation and ascribe the result vectorized statements to
6609    the scalar statements.
6610 */
6611
6612 void
6613 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6614                              vec<tree> dr_chain,
6615                              int size, gimple_stmt_iterator *gsi)
6616 {
6617   machine_mode mode;
6618   vec<tree> result_chain = vNULL;
6619
6620   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6621      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6622      vectors, that are ready for vector computation.  */
6623   result_chain.create (size);
6624
6625   /* If reassociation width for vector type is 2 or greater target machine can
6626      execute 2 or more vector instructions in parallel.  Otherwise try to
6627      get chain for loads group using vect_shift_permute_load_chain.  */
6628   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6629   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6630       || pow2p_hwi (size)
6631       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6632                                          gsi, &result_chain))
6633     vect_permute_load_chain (vinfo, dr_chain,
6634                              size, stmt_info, gsi, &result_chain);
6635   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6636   result_chain.release ();
6637 }
6638
6639 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6640    generated as part of the vectorization of STMT_INFO.  Assign the statement
6641    for each vector to the associated scalar statement.  */
6642
6643 void
6644 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6645                                   vec<tree> result_chain)
6646 {
6647   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6648   unsigned int i, gap_count;
6649   tree tmp_data_ref;
6650
6651   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6652      Since we scan the chain starting from it's first node, their order
6653      corresponds the order of data-refs in RESULT_CHAIN.  */
6654   stmt_vec_info next_stmt_info = first_stmt_info;
6655   gap_count = 1;
6656   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6657     {
6658       if (!next_stmt_info)
6659         break;
6660
6661       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6662        code elimination pass later.  No need to check for the first stmt in
6663        the group, since it always exists.
6664        DR_GROUP_GAP is the number of steps in elements from the previous
6665        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6666        correspond to the gaps.  */
6667       if (next_stmt_info != first_stmt_info
6668           && gap_count < DR_GROUP_GAP (next_stmt_info))
6669         {
6670           gap_count++;
6671           continue;
6672         }
6673
6674       /* ???  The following needs cleanup after the removal of
6675          DR_GROUP_SAME_DR_STMT.  */
6676       if (next_stmt_info)
6677         {
6678           gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6679           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6680              copies, and we put the new vector statement last.  */
6681           STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6682
6683           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6684           gap_count = 1;
6685         }
6686     }
6687 }
6688
6689 /* Function vect_force_dr_alignment_p.
6690
6691    Returns whether the alignment of a DECL can be forced to be aligned
6692    on ALIGNMENT bit boundary.  */
6693
6694 bool
6695 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6696 {
6697   if (!VAR_P (decl))
6698     return false;
6699
6700   if (decl_in_symtab_p (decl)
6701       && !symtab_node::get (decl)->can_increase_alignment_p ())
6702     return false;
6703
6704   if (TREE_STATIC (decl))
6705     return (known_le (alignment,
6706                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6707   else
6708     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6709 }
6710
6711 /* Return how the data reference DR_INFO, accessed with vector type VECTYPE
6712    and the given MISALIGNMENT, is supported with respect to its alignment:
6713    dr_aligned when MISALIGNMENT is zero, otherwise dr_unaligned_supported,
6714    dr_explicit_realign, dr_explicit_realign_optimized or
6715    dr_unaligned_unsupported.  VINFO is checked for being a loop_vec_info.  */
6716
6717 enum dr_alignment_support
6718 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6719                                tree vectype, int misalignment)
6720 {
6721   data_reference *dr = dr_info->dr;
6722   stmt_vec_info stmt_info = dr_info->stmt;
6723   machine_mode mode = TYPE_MODE (vectype);
6724   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6725   class loop *vect_loop = NULL;
6726   bool nested_in_vect_loop = false;
6727
6728   if (misalignment == 0)
6729     return dr_aligned;
6730
6731   /* For now assume all conditional loads/stores (IFN_MASK_LOAD and
6732      IFN_MASK_STORE) support unaligned access without any special code.  */
6733   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6734     if (gimple_call_internal_p (stmt)
6735         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6736             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6737       return dr_unaligned_supported;
6738
6739   if (loop_vinfo)
6740     {
6741       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6742       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6743     }
6744
6745   /* Possibly unaligned access.  */
6746
6747   /* We can choose between using the implicit realignment scheme (generating
6748      a misaligned_move stmt) and the explicit realignment scheme (generating
6749      aligned loads with a REALIGN_LOAD).  There are two variants to the
6750      explicit realignment scheme: optimized, and unoptimized.
6751      We can optimize the realignment only if the step between consecutive
6752      vector loads is equal to the vector size.  Since the vector memory
6753      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6754      is guaranteed that the misalignment amount remains the same throughout the
6755      execution of the vectorized loop.  Therefore, we can create the
6756      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6757      at the loop preheader.
6758
6759      However, in the case of outer-loop vectorization, when vectorizing a
6760      memory access in the inner-loop nested within the LOOP that is now being
6761      vectorized, while it is guaranteed that the misalignment of the
6762      vectorized memory access will remain the same in different outer-loop
6763      iterations, it is *not* guaranteed that it will remain the same throughout
6764      the execution of the inner-loop.  This is because the inner-loop advances
6765      with the original scalar step (and not in steps of VS).  If the inner-loop
6766      step happens to be a multiple of VS, then the misalignment remains fixed
6767      and we can use the optimized realignment scheme.  For example:
6768
6769       for (i=0; i<N; i++)
6770         for (j=0; j<M; j++)
6771           s += a[i+j];
6772
6773      When vectorizing the i-loop in the above example, the step between
6774      consecutive vector loads is 1, and so the misalignment does not remain
6775      fixed across the execution of the inner-loop, and the realignment cannot
6776      be optimized (as illustrated in the following pseudo vectorized loop):
6777
6778       for (i=0; i<N; i+=4)
6779         for (j=0; j<M; j++){
6780           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6781                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6782                          // (assuming that we start from an aligned address).
6783           }
6784
6785      We therefore have to use the unoptimized realignment scheme:
6786
6787       for (i=0; i<N; i+=4)
6788           for (j=k; j<M; j+=4)
6789           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6790                            // that the misalignment of the initial address is
6791                            // 0).
6792
6793      The loop can then be vectorized as follows:
6794
6795       for (k=0; k<4; k++){
6796         rt = get_realignment_token (&vp[k]);
6797         for (i=0; i<N; i+=4){
6798           v1 = vp[i+k];
6799           for (j=k; j<M; j+=4){
6800             v2 = vp[i+j+VS-1];
6801             va = REALIGN_LOAD <v1,v2,rt>;
6802             vs += va;
6803             v1 = v2;
6804           }
6805         }
6806     } */
6807
6808   if (DR_IS_READ (dr))
6809     {
6810       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6811           && (!targetm.vectorize.builtin_mask_for_load
6812               || targetm.vectorize.builtin_mask_for_load ()))
6813         {
6814           /* If we are doing SLP then the accesses need not have the
6815              same alignment, instead it depends on the SLP group size.  */
6816           if (loop_vinfo
6817               && STMT_SLP_TYPE (stmt_info)
6818               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6819                               * (DR_GROUP_SIZE
6820                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6821                               TYPE_VECTOR_SUBPARTS (vectype)))
6822             ;  /* No explicit realignment; fall through to the hook below.  */
6823           else if (!loop_vinfo
6824                    || (nested_in_vect_loop
6825                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6826                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6827             return dr_explicit_realign;
6828           else
6829             return dr_explicit_realign_optimized;
6830         }
6831     }
6832
6833   bool is_packed = false;  /* Only computed when misalignment is unknown.  */
6834   tree type = TREE_TYPE (DR_REF (dr));
6835   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6836     is_packed = not_size_aligned (DR_REF (dr));
6837   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6838                                                      is_packed))
6839     return dr_unaligned_supported;
6840
6841   /* Unsupported.  */
6842   return dr_unaligned_unsupported;
6843 }