gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
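/* Illustrative example (a sketch, not taken from the sources): for a
   group of COUNT == 2 accesses with VECTYPE V4SI, a target such as
   AArch64 provides a two-vector array mode and a vec_load_lanes handler
   that de-interleaves on the fly, so that a scalar loop like

     for (i = 0; i < n; i++)
       {
         a[i] = s[2 * i];
         b[i] = s[2 * i + 1];
       }

   can fetch both lanes with a single lane-wise load.  If no such handler
   exists, the vectorizer falls back to contiguous loads followed by
   permutes.  */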
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
152 unsigned int i = 0;
153 if (gimple_call_internal_p (call))
155 internal_fn ifn = gimple_call_internal_fn (call);
156 if (internal_load_fn_p (ifn))
157 /* For loads the LHS type does the trick. */
158 i = ~0U;
159 else if (internal_store_fn_p (ifn))
161 /* For stores use the type of the stored value. */
162 i = internal_fn_stored_value_index (ifn);
163 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 i = ~0U;
166 else if (internal_fn_mask_index (ifn) == 0)
167 i = 1;
169 if (i < gimple_call_num_args (call))
171 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
174 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 if (rhs < lhs)
176 scalar_type = rhs_type;
181 return scalar_type;
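/* Worked example (illustrative): in

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] * 2;

   the promotion from short to int has a 4-byte result but a 2-byte rhs,
   so for that statement the function above returns the short type.  With
   128-bit vectors this yields a vectorization factor of 8 rather than
   the 4 suggested by the int result alone.  */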
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186 tested at run-time. Return TRUE if DDR was successfully inserted.
187 Return false if versioning is not supported. */
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
194 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195 return opt_result::failure_at (vect_location,
196 "will not create alias checks, as"
197 " --param vect-max-version-for-alias-checks"
198 " == 0\n");
200 opt_result res
201 = runtime_alias_check_p (ddr, loop,
202 optimize_loop_nest_for_speed_p (loop));
203 if (!res)
204 return res;
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207 return opt_result::success ();
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
215 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216 for (unsigned int i = 0; i < checks.length(); ++i)
217 if (checks[i] == value)
218 return;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location,
222 "need run-time check that %T is nonzero\n",
223 value);
224 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229 DR_INFO_B. At least one of the accesses is a write. */
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
234 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
237 /* Single statements are always kept in their original order. */
238 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240 return true;
242 /* STMT_A and STMT_B belong to overlapping groups. All loads are
243 emitted at the position of the first scalar load.
244 Stores in a group are emitted at the position of the last scalar store.
245 Compute that position and check whether the resulting order matches
246 the current one. */
247 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248 if (il_a)
250 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 s = DR_GROUP_NEXT_ELEMENT (s))
253 il_a = get_later_stmt (il_a, s);
254 else /* DR_IS_READ */
255 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 s = DR_GROUP_NEXT_ELEMENT (s))
257 if (get_later_stmt (il_a, s) == il_a)
258 il_a = s;
260 else
261 il_a = stmtinfo_a;
262 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263 if (il_b)
265 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 s = DR_GROUP_NEXT_ELEMENT (s))
268 il_b = get_later_stmt (il_b, s);
269 else /* DR_IS_READ */
270 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 s = DR_GROUP_NEXT_ELEMENT (s))
272 if (get_later_stmt (il_b, s) == il_b)
273 il_b = s;
275 else
276 il_b = stmtinfo_b;
277 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
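/* Illustrative sketch: for a pair of two-element groups

     ... = a[i];
     a[i] = ...;
     ... = a[i+1];
     a[i+1] = ...;

   the vectorized load is emitted at the position of the first scalar
   load and the vectorized store at the position of the last scalar
   store, so the load still precedes the store, matching the scalar
   order; the check above verifies that this property holds for the two
   data references being compared.  */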
281 /* A subroutine of vect_analyze_data_ref_dependence. Handle
282 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283 distances. These distances are conservatively correct but they don't
284 reflect a guaranteed dependence.
286 Return true if this function does all the work necessary to avoid
287 an alias or false if the caller should use the dependence distances
288 to limit the vectorization factor in the usual way. LOOP_DEPTH is
289 the depth of the loop described by LOOP_VINFO and the other arguments
290 are as for vect_analyze_data_ref_dependence. */
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 int loop_depth, unsigned int *max_vf)
297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
300 int dist = dist_v[loop_depth];
301 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
303 /* If the user asserted safelen >= DIST consecutive iterations
304 can be executed concurrently, assume independence.
306 ??? An alternative would be to add the alias check even
307 in this case, and vectorize the fallback loop with the
308 maximum VF set to safelen. However, if the user has
309 explicitly given a length, it's less likely that that
310 would be a win. */
311 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
313 if ((unsigned int) loop->safelen < *max_vf)
314 *max_vf = loop->safelen;
315 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 continue;
319 /* For dependence distances of 2 or more, we have the option
320 of limiting VF or checking for an alias at runtime.
321 Prefer to check at runtime if we can, to avoid limiting
322 the VF unnecessarily when the bases are in fact independent.
324 Note that the alias checks will be removed if the VF ends up
325 being small enough. */
326 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
333 return true;
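/* Illustrative example: with

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       p[i] = q[i] + 1;

   a conservative dependence distance of, say, 4 between the two refs is
   treated as independence because the user asserted that 8 consecutive
   iterations may run concurrently; *MAX_VF is then capped at 8.
   Otherwise a runtime alias check is preferred over limiting the VF.  */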
337 /* Function vect_analyze_data_ref_dependence.
339 FIXME: I needed to change the sense of the returned flag.
341 Return FALSE if there (might) exist a dependence between a memory-reference
342 DRA and a memory-reference DRB. When versioning for alias may check a
343 dependence at run-time, return TRUE. Adjust *MAX_VF according to
344 the data dependence. */
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 loop_vec_info loop_vinfo,
349 unsigned int *max_vf)
351 unsigned int i;
352 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353 struct data_reference *dra = DDR_A (ddr);
354 struct data_reference *drb = DDR_B (ddr);
355 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359 lambda_vector dist_v;
360 unsigned int loop_depth;
362 /* In loop analysis all data references should be vectorizable. */
363 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
364 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
365 gcc_unreachable ();
367 /* Independent data accesses. */
368 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
369 return opt_result::success ();
371 if (dra == drb
372 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
373 return opt_result::success ();
375 /* We do not have to consider dependences between accesses that belong
376 to the same group, unless the stride could be smaller than the
377 group size. */
378 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
379 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
380 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
381 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
382 return opt_result::success ();
384 /* Even if we have an anti-dependence then, as the vectorized loop covers at
385 least two scalar iterations, there is always also a true dependence.
386 As the vectorizer does not re-order loads and stores we can ignore
387 the anti-dependence if TBAA can disambiguate both DRs similar to the
388 case with known negative distance anti-dependences (positive
389 distance anti-dependences would violate TBAA constraints). */
390 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
391 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
392 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
393 get_alias_set (DR_REF (drb))))
394 return opt_result::success ();
396 /* Unknown data dependence. */
397 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
399 /* If user asserted safelen consecutive iterations can be
400 executed concurrently, assume independence. */
401 if (loop->safelen >= 2)
403 if ((unsigned int) loop->safelen < *max_vf)
404 *max_vf = loop->safelen;
405 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
406 return opt_result::success ();
409 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
410 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 return opt_result::failure_at
412 (stmtinfo_a->stmt,
413 "versioning for alias not supported for: "
414 "can't determine dependence between %T and %T\n",
415 DR_REF (dra), DR_REF (drb));
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
419 "versioning for alias required: "
420 "can't determine dependence between %T and %T\n",
421 DR_REF (dra), DR_REF (drb));
423 /* Add to list of ddrs that need to be tested at run-time. */
424 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
427 /* Known data dependence. */
428 if (DDR_NUM_DIST_VECTS (ddr) == 0)
430 /* If user asserted safelen consecutive iterations can be
431 executed concurrently, assume independence. */
432 if (loop->safelen >= 2)
434 if ((unsigned int) loop->safelen < *max_vf)
435 *max_vf = loop->safelen;
436 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
437 return opt_result::success ();
440 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
441 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
442 return opt_result::failure_at
443 (stmtinfo_a->stmt,
444 "versioning for alias not supported for: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
448 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
450 "versioning for alias required: "
451 "bad dist vector for %T and %T\n",
452 DR_REF (dra), DR_REF (drb));
453 /* Add to list of ddrs that need to be tested at run-time. */
454 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
457 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
459 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
460 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
461 loop_depth, max_vf))
462 return opt_result::success ();
464 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
466 int dist = dist_v[loop_depth];
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance = %d.\n", dist);
472 if (dist == 0)
474 if (dump_enabled_p ())
475 dump_printf_loc (MSG_NOTE, vect_location,
476 "dependence distance == 0 between %T and %T\n",
477 DR_REF (dra), DR_REF (drb));
479 /* When we perform grouped accesses and perform implicit CSE
480 by detecting equal accesses and doing disambiguation with
481 runtime alias tests like for
482 .. = a[i];
483 .. = a[i+1];
484 a[i] = ..;
485 a[i+1] = ..;
486 *p = ..;
487 .. = a[i];
488 .. = a[i+1];
489 where we will end up loading { a[i], a[i+1] } once, make
490 sure that inserting group loads before the first load and
491 stores after the last store will do the right thing.
492 Similar for groups like
493 a[i] = ...;
494 ... = a[i];
495 a[i+1] = ...;
496 where loads from the group interleave with the store. */
497 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
498 return opt_result::failure_at (stmtinfo_a->stmt,
499 "READ_WRITE dependence"
500 " in interleaving.\n");
502 if (loop->safelen < 2)
504 tree indicator = dr_zero_step_indicator (dra);
505 if (!indicator || integer_zerop (indicator))
506 return opt_result::failure_at (stmtinfo_a->stmt,
507 "access also has a zero step\n");
508 else if (TREE_CODE (indicator) != INTEGER_CST)
509 vect_check_nonzero_value (loop_vinfo, indicator);
511 continue;
514 if (dist > 0 && DDR_REVERSED_P (ddr))
516 /* If DDR_REVERSED_P the order of the data-refs in DDR was
517 reversed (to make distance vector positive), and the actual
518 distance is negative. */
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance negative.\n");
522 /* When doing outer loop vectorization, we need to check if there is
523 a backward dependence at the inner loop level if the dependence
524 at the outer loop is reversed. See PR81740. */
525 if (nested_in_vect_loop_p (loop, stmtinfo_a)
526 || nested_in_vect_loop_p (loop, stmtinfo_b))
528 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
529 DDR_LOOP_NEST (ddr));
530 if (dist_v[inner_depth] < 0)
531 return opt_result::failure_at (stmtinfo_a->stmt,
532 "not vectorized, dependence "
533 "between data-refs %T and %T\n",
534 DR_REF (dra), DR_REF (drb));
536 /* Record a negative dependence distance to later limit the
537 amount of stmt copying / unrolling we can perform.
538 Only need to handle read-after-write dependence. */
539 if (DR_IS_READ (drb)
540 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
541 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
542 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
543 continue;
546 unsigned int abs_dist = abs (dist);
547 if (abs_dist >= 2 && abs_dist < *max_vf)
549 /* The dependence distance requires reduction of the maximal
550 vectorization factor. */
551 *max_vf = abs_dist;
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "adjusting maximal vectorization factor to %i\n",
555 *max_vf);
558 if (abs_dist >= *max_vf)
560 /* Dependence distance does not create dependence, as far as
561 vectorization is concerned, in this case. */
562 if (dump_enabled_p ())
563 dump_printf_loc (MSG_NOTE, vect_location,
564 "dependence distance >= VF.\n");
565 continue;
568 return opt_result::failure_at (stmtinfo_a->stmt,
569 "not vectorized, possible dependence "
570 "between data-refs %T and %T\n",
571 DR_REF (dra), DR_REF (drb));
574 return opt_result::success ();
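/* Worked example (illustrative): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1.0f;

   the dependence distance is 3, so *MAX_VF is reduced to 3 and the loop
   can still be vectorized with a smaller VF, while a distance of 1 would
   reach the "possible dependence between data-refs" failure above.  */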
577 /* Function vect_analyze_data_ref_dependences.
579 Examine all the data references in the loop, and make sure there do not
580 exist any data dependences between them. Set *MAX_VF according to
581 the maximum vectorization factor the data dependences allow. */
583 opt_result
584 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
585 unsigned int *max_vf)
587 unsigned int i;
588 struct data_dependence_relation *ddr;
590 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
592 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
594 LOOP_VINFO_DDRS (loop_vinfo)
595 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
596 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
597 /* We do not need read-read dependences. */
598 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
599 &LOOP_VINFO_DDRS (loop_vinfo),
600 LOOP_VINFO_LOOP_NEST (loop_vinfo),
601 false);
602 gcc_assert (res);
605 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
607 /* For epilogues we either have no aliases or alias versioning
608 was applied to the original loop. Therefore we may just get max_vf
609 using the VF of the original loop. */
610 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
611 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
612 else
613 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
615 opt_result res
616 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
617 if (!res)
618 return res;
621 return opt_result::success ();
625 /* Function vect_slp_analyze_data_ref_dependence.
627 Return TRUE if there (might) exist a dependence between a memory-reference
628 DRA and a memory-reference DRB for VINFO. When versioning for alias
629 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
630 according to the data dependence. */
632 static bool
633 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
634 struct data_dependence_relation *ddr)
636 struct data_reference *dra = DDR_A (ddr);
637 struct data_reference *drb = DDR_B (ddr);
638 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
639 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
641 /* We need to check dependences of statements marked as unvectorizable
642 as well; they can still prohibit vectorization. */
644 /* Independent data accesses. */
645 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
646 return false;
648 if (dra == drb)
649 return false;
651 /* Read-read is OK. */
652 if (DR_IS_READ (dra) && DR_IS_READ (drb))
653 return false;
655 /* If dra and drb are part of the same interleaving chain consider
656 them independent. */
657 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
658 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
659 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
660 return false;
662 /* Unknown data dependence. */
663 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "can't determine dependence between %T and %T\n",
668 DR_REF (dra), DR_REF (drb));
670 else if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "determined dependence between %T and %T\n",
673 DR_REF (dra), DR_REF (drb));
675 return true;
679 /* Analyze dependences involved in the transform of SLP NODE. STORES
680 contain the vector of scalar stores of this instance if we are
681 disambiguating the loads. */
683 static bool
684 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
685 vec<stmt_vec_info> stores,
686 stmt_vec_info last_store_info)
688 /* This walks over all stmts involved in the SLP load/store done
689 in NODE verifying we can sink them up to the last stmt in the
690 group. */
691 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
693 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
694 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
696 stmt_vec_info access_info
697 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
698 if (access_info == last_access_info)
699 continue;
700 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
701 ao_ref ref;
702 bool ref_initialized_p = false;
703 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
704 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
706 gimple *stmt = gsi_stmt (gsi);
707 if (! gimple_vuse (stmt))
708 continue;
710 /* If we couldn't record a (single) data reference for this
711 stmt we have to resort to the alias oracle. */
712 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
713 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
714 if (!dr_b)
716 /* We are moving a store - this means
717 we cannot use TBAA for disambiguation. */
718 if (!ref_initialized_p)
719 ao_ref_init (&ref, DR_REF (dr_a));
720 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
721 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
722 return false;
723 continue;
726 bool dependent = false;
727 /* If we run into a store of this same instance (we've just
728 marked those) then delay dependence checking until we run
729 into the last store because this is where it will have
730 been sunk to (and we verify if we can do that as well). */
731 if (gimple_visited_p (stmt))
733 if (stmt_info != last_store_info)
734 continue;
736 for (stmt_vec_info &store_info : stores)
738 data_reference *store_dr
739 = STMT_VINFO_DATA_REF (store_info);
740 ddr_p ddr = initialize_data_dependence_relation
741 (dr_a, store_dr, vNULL);
742 dependent
743 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
744 free_dependence_relation (ddr);
745 if (dependent)
746 break;
749 else
751 ddr_p ddr = initialize_data_dependence_relation (dr_a,
752 dr_b, vNULL);
753 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
754 free_dependence_relation (ddr);
756 if (dependent)
757 return false;
761 else /* DR_IS_READ */
763 stmt_vec_info first_access_info
764 = vect_find_first_scalar_stmt_in_slp (node);
765 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
767 stmt_vec_info access_info
768 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
769 if (access_info == first_access_info)
770 continue;
771 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
772 ao_ref ref;
773 bool ref_initialized_p = false;
774 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
775 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
777 gimple *stmt = gsi_stmt (gsi);
778 if (! gimple_vdef (stmt))
779 continue;
781 /* If we couldn't record a (single) data reference for this
782 stmt we have to resort to the alias oracle. */
783 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
784 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
786 /* We are hoisting a load - this means we can use
787 TBAA for disambiguation. */
788 if (!ref_initialized_p)
789 ao_ref_init (&ref, DR_REF (dr_a));
790 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
792 if (!dr_b)
793 return false;
794 /* Resort to dependence checking below. */
796 else
797 /* No dependence. */
798 continue;
800 bool dependent = false;
801 /* If we run into a store of this same instance (we've just
802 marked those) then delay dependence checking until we run
803 into the last store because this is where it will have
804 been sunk to (and we verify if we can do that as well). */
805 if (gimple_visited_p (stmt))
807 if (stmt_info != last_store_info)
808 continue;
810 for (stmt_vec_info &store_info : stores)
812 data_reference *store_dr
813 = STMT_VINFO_DATA_REF (store_info);
814 ddr_p ddr = initialize_data_dependence_relation
815 (dr_a, store_dr, vNULL);
816 dependent
817 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
818 free_dependence_relation (ddr);
819 if (dependent)
820 break;
823 else
825 ddr_p ddr = initialize_data_dependence_relation (dr_a,
826 dr_b, vNULL);
827 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
828 free_dependence_relation (ddr);
830 if (dependent)
831 return false;
835 return true;
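/* Illustrative sketch: in a basic block such as

     a[0] = x0;
     t = *p;
     a[1] = x1;

   the SLP store group { a[0], a[1] } would be emitted at the position of
   the last scalar store, so the walk above must verify that the store to
   a[0] can be sunk past the intervening load of *p; if the dependence
   cannot be ruled out, the instance is rejected.  */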
839 /* Function vect_slp_analyze_instance_dependence.
841 Examine all the data references in the SLP instance, and make sure there
842 do not exist any data dependences between them that would prevent the
843 instance from being vectorized. */
845 bool
846 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
848 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
850 /* The stores of this instance are at the root of the SLP tree. */
851 slp_tree store = NULL;
852 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
853 store = SLP_INSTANCE_TREE (instance);
855 /* Verify we can sink stores to the vectorized stmt insert location. */
856 stmt_vec_info last_store_info = NULL;
857 if (store)
859 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
860 return false;
862 /* Mark stores in this instance and remember the last one. */
863 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
864 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
865 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
868 bool res = true;
870 /* Verify we can sink loads to the vectorized stmt insert location,
871 special-casing stores of this instance. */
872 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
873 if (! vect_slp_analyze_node_dependences (vinfo, load,
874 store
875 ? SLP_TREE_SCALAR_STMTS (store)
876 : vNULL, last_store_info))
878 res = false;
879 break;
882 /* Unset the visited flag. */
883 if (store)
884 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
885 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
887 return res;
890 /* Record the base alignment guarantee given by DRB, which occurs
891 in STMT_INFO. */
893 static void
894 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
895 innermost_loop_behavior *drb)
897 bool existed;
898 innermost_loop_behavior *&entry
899 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
900 if (!existed || entry->base_alignment < drb->base_alignment)
902 entry = drb;
903 if (dump_enabled_p ())
904 dump_printf_loc (MSG_NOTE, vect_location,
905 "recording new base alignment for %T\n"
906 " alignment: %d\n"
907 " misalignment: %d\n"
908 " based on: %G",
909 drb->base_address,
910 drb->base_alignment,
911 drb->base_misalignment,
912 stmt_info->stmt);
916 /* If the region we're going to vectorize is reached, all unconditional
917 data references occur at least once. We can therefore pool the base
918 alignment guarantees from each unconditional reference. Do this by
919 going through all the data references in VINFO and checking whether
920 the containing statement makes the reference unconditionally. If so,
921 record the alignment of the base address in VINFO so that it can be
922 used for all other references with the same base. */
924 void
925 vect_record_base_alignments (vec_info *vinfo)
927 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
928 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
929 for (data_reference *dr : vinfo->shared->datarefs)
931 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
932 stmt_vec_info stmt_info = dr_info->stmt;
933 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
934 && STMT_VINFO_VECTORIZABLE (stmt_info)
935 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
937 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
939 /* If DR is nested in the loop that is being vectorized, we can also
940 record the alignment of the base wrt the outer loop. */
941 if (loop && nested_in_vect_loop_p (loop, stmt_info))
942 vect_record_base_alignment
943 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
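/* Illustrative example: if a loop contains an unconditional store
   p[i] = ... whose base is known to be 32-byte aligned and a second,
   conditional access p[i + 8] with the same base, the guarantee recorded
   for the unconditional reference is pooled here and reused when
   computing the alignment of the conditional one.  */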
948 /* Return the target alignment for the vectorized form of DR_INFO. */
950 static poly_uint64
951 vect_calculate_target_alignment (dr_vec_info *dr_info)
953 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
954 return targetm.vectorize.preferred_vector_alignment (vectype);
957 /* Function vect_compute_data_ref_alignment
959 Compute the misalignment of the data reference DR_INFO.
961 Output:
962 1. DR_MISALIGNMENT (DR_INFO) is defined.
964 FOR NOW: No analysis is actually performed. Misalignment is calculated
965 only for trivial cases. TODO. */
967 static void
968 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
970 stmt_vec_info stmt_info = dr_info->stmt;
971 vec_base_alignments *base_alignments = &vinfo->base_alignments;
972 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
973 class loop *loop = NULL;
974 tree ref = DR_REF (dr_info->dr);
975 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
977 if (dump_enabled_p ())
978 dump_printf_loc (MSG_NOTE, vect_location,
979 "vect_compute_data_ref_alignment:\n");
981 if (loop_vinfo)
982 loop = LOOP_VINFO_LOOP (loop_vinfo);
984 /* Initialize misalignment to unknown. */
985 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
987 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
988 return;
990 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
991 bool step_preserves_misalignment_p;
993 poly_uint64 vector_alignment
994 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
995 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
997 /* If the main loop has peeled for alignment we have no way of knowing
998 whether the data accesses in the epilogues are aligned. We can't at
999 compile time answer the question whether we have entered the main loop or
1000 not. Fixes PR 92351. */
1001 if (loop_vinfo)
1003 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1004 if (orig_loop_vinfo
1005 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1006 return;
1009 unsigned HOST_WIDE_INT vect_align_c;
1010 if (!vector_alignment.is_constant (&vect_align_c))
1011 return;
1013 /* No step for BB vectorization. */
1014 if (!loop)
1016 gcc_assert (integer_zerop (drb->step));
1017 step_preserves_misalignment_p = true;
1020 /* In case the dataref is in an inner-loop of the loop that is being
1021 vectorized (LOOP), we use the base and misalignment information
1022 relative to the outer-loop (LOOP). This is ok only if the misalignment
1023 stays the same throughout the execution of the inner-loop, which is why
1024 we have to check that the stride of the dataref in the inner-loop evenly
1025 divides by the vector alignment. */
1026 else if (nested_in_vect_loop_p (loop, stmt_info))
1028 step_preserves_misalignment_p
1029 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1031 if (dump_enabled_p ())
1033 if (step_preserves_misalignment_p)
1034 dump_printf_loc (MSG_NOTE, vect_location,
1035 "inner step divides the vector alignment.\n");
1036 else
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "inner step doesn't divide the vector"
1039 " alignment.\n");
1043 /* Similarly we can only use base and misalignment information relative to
1044 an innermost loop if the misalignment stays the same throughout the
1045 execution of the loop. As above, this is the case if the stride of
1046 the dataref evenly divides by the alignment. */
1047 else
1049 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1050 step_preserves_misalignment_p
1051 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1053 if (!step_preserves_misalignment_p && dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "step doesn't divide the vector alignment.\n");
1058 unsigned int base_alignment = drb->base_alignment;
1059 unsigned int base_misalignment = drb->base_misalignment;
1061 /* Calculate the maximum of the pooled base address alignment and the
1062 alignment that we can compute for DR itself. */
1063 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1064 if (entry && base_alignment < (*entry)->base_alignment)
1066 base_alignment = (*entry)->base_alignment;
1067 base_misalignment = (*entry)->base_misalignment;
1070 if (drb->offset_alignment < vect_align_c
1071 || !step_preserves_misalignment_p
1072 /* We need to know whether the step wrt the vectorized loop is
1073 negative when computing the starting misalignment below. */
1074 || TREE_CODE (drb->step) != INTEGER_CST)
1076 if (dump_enabled_p ())
1077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1078 "Unknown alignment for access: %T\n", ref);
1079 return;
1082 if (base_alignment < vect_align_c)
1084 unsigned int max_alignment;
1085 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1086 if (max_alignment < vect_align_c
1087 || !vect_can_force_dr_alignment_p (base,
1088 vect_align_c * BITS_PER_UNIT))
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "can't force alignment of ref: %T\n", ref);
1093 return;
1096 /* Force the alignment of the decl.
1097 NOTE: This is the only change to the code we make during
1098 the analysis phase, before deciding to vectorize the loop. */
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_NOTE, vect_location,
1101 "force alignment of %T\n", ref);
1103 dr_info->base_decl = base;
1104 dr_info->base_misaligned = true;
1105 base_misalignment = 0;
1107 poly_int64 misalignment
1108 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1110 /* If this is a backward running DR then first access in the larger
1111 vectype actually is N-1 elements before the address in the DR.
1112 Adjust misalign accordingly. */
1113 if (tree_int_cst_sgn (drb->step) < 0)
1114 /* PLUS because STEP is negative. */
1115 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1116 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1118 unsigned int const_misalignment;
1119 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 "Non-constant misalignment for access: %T\n", ref);
1124 return;
1127 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "misalign = %d bytes of ref %T\n",
1132 DR_MISALIGNMENT (dr_info), ref);
1134 return;
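/* Worked example (illustrative): with a 16-byte target alignment, a ref
   whose base is 16-byte aligned with base misalignment 0 and DR_INIT 4
   (e.g. &a[1] for 4-byte elements) ends up with

     misalignment = 0 + 4 = 4,  i.e. DR_MISALIGNMENT (dr_info) == 4.

   A base known to be only 8-byte aligned would instead be forced to
   16 bytes via base_decl/base_misaligned above, when the decl permits
   it.  */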
1137 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1138 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1139 is made aligned via peeling. */
1141 static bool
1142 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1143 dr_vec_info *dr_peel_info)
1145 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1146 DR_TARGET_ALIGNMENT (dr_info)))
1148 poly_offset_int diff
1149 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1150 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1151 if (known_eq (diff, 0)
1152 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1153 return true;
1155 return false;
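/* Illustrative example: with a 16-byte target alignment for both refs,
   DR_INIT 20 for the peeled ref and DR_INIT 4 for the other, the
   difference is 16, a multiple of the target alignment, so aligning the
   peeled ref by peeling also aligns the related one.  */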
1158 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1159 aligned via peeling. */
1161 static bool
1162 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1163 dr_vec_info *dr_peel_info)
1165 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1166 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1167 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1168 DR_OFFSET (dr_peel_info->dr), 0)
1169 || !operand_equal_p (DR_STEP (dr_info->dr),
1170 DR_STEP (dr_peel_info->dr), 0))
1171 return false;
1173 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1176 /* Function vect_update_misalignment_for_peel.
1177 Sets DR_INFO's misalignment
1178 - to 0 if it has the same alignment as DR_PEEL_INFO,
1179 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1180 - to -1 (unknown) otherwise.
1182 DR_INFO - the data reference whose misalignment is to be adjusted.
1183 DR_PEEL_INFO - the data reference whose misalignment is being made
1184 zero in the vector loop by the peel.
1185 NPEEL - the number of iterations in the peel loop if the misalignment
1186 of DR_PEEL_INFO is known at compile time. */
1188 static void
1189 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1190 dr_vec_info *dr_peel_info, int npeel)
1192 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1193 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1195 SET_DR_MISALIGNMENT (dr_info, 0);
1196 return;
1199 unsigned HOST_WIDE_INT alignment;
1200 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1201 && known_alignment_for_access_p (dr_info)
1202 && known_alignment_for_access_p (dr_peel_info))
1204 int misal = DR_MISALIGNMENT (dr_info);
1205 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1206 misal &= alignment - 1;
1207 SET_DR_MISALIGNMENT (dr_info, misal);
1208 return;
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1213 "to unknown (-1).\n");
1214 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
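/* Worked arithmetic (illustrative): with a 16-byte target alignment,
   DR_MISALIGNMENT 12, a DR_STEP of 4 and NPEEL 1, the update above gives

     misal = (12 + 1 * 4) & (16 - 1) = 0

   i.e. peeling a single iteration happens to align this reference as
   well.  */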
1217 /* Return true if alignment is relevant for DR_INFO. */
1219 static bool
1220 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1222 stmt_vec_info stmt_info = dr_info->stmt;
1224 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1225 return false;
1227 /* For interleaving, only the alignment of the first access matters. */
1228 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1229 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1230 return false;
1232 /* Scatter-gather and invariant accesses continue to address individual
1233 scalars, so vector-level alignment is irrelevant. */
1234 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1235 || integer_zerop (DR_STEP (dr_info->dr)))
1236 return false;
1238 /* Strided accesses perform only component accesses, alignment is
1239 irrelevant for them. */
1240 if (STMT_VINFO_STRIDED_P (stmt_info)
1241 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 return false;
1244 return true;
1247 /* Given a memory reference EXP return whether its alignment is less
1248 than its size. */
1250 static bool
1251 not_size_aligned (tree exp)
1253 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1254 return true;
1256 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1257 > get_object_alignment (exp));
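/* Illustrative example: a 4-byte int member of a packed struct may only
   be guaranteed 1-byte alignment, so its size exceeds its alignment and
   the check above classifies the access as "packed".  */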
1260 /* Function vector_alignment_reachable_p
1262 Return true if vector alignment for DR_INFO is reachable by peeling
1263 a few loop iterations. Return false otherwise. */
1265 static bool
1266 vector_alignment_reachable_p (dr_vec_info *dr_info)
1268 stmt_vec_info stmt_info = dr_info->stmt;
1269 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1271 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1273 /* For interleaved access we peel only if the number of iterations in
1274 the prolog loop ({VF - misalignment}) is a multiple of the
1275 number of interleaved accesses. */
1276 int elem_size, mis_in_elements;
1278 /* FORNOW: handle only known alignment. */
1279 if (!known_alignment_for_access_p (dr_info))
1280 return false;
1282 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1283 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1284 elem_size = vector_element_size (vector_size, nelements);
1285 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1287 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1288 return false;
1291 /* If misalignment is known at the compile time then allow peeling
1292 only if natural alignment is reachable through peeling. */
1293 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1295 HOST_WIDE_INT elmsize =
1296 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1297 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_NOTE, vect_location,
1300 "data size = %wd. misalignment = %d.\n", elmsize,
1301 DR_MISALIGNMENT (dr_info));
1303 if (DR_MISALIGNMENT (dr_info) % elmsize)
1305 if (dump_enabled_p ())
1306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1307 "data size does not divide the misalignment.\n");
1308 return false;
1312 if (!known_alignment_for_access_p (dr_info))
1314 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1315 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1316 if (dump_enabled_p ())
1317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1318 "Unknown misalignment, %snaturally aligned\n",
1319 is_packed ? "not " : "");
1320 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1323 return true;
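/* Illustrative example: with 4-element vectors, a known misalignment of
   2 elements and an interleaving group of size 2, the prologue needs
   4 - 2 = 2 peeled iterations, a multiple of the group size, so peeling
   can reach alignment; with a group size of 3 it could not.  */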
1327 /* Calculate the cost of the memory access represented by DR_INFO. */
1329 static void
1330 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1331 unsigned int *inside_cost,
1332 unsigned int *outside_cost,
1333 stmt_vector_for_cost *body_cost_vec,
1334 stmt_vector_for_cost *prologue_cost_vec)
1336 stmt_vec_info stmt_info = dr_info->stmt;
1337 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1338 int ncopies;
1340 if (PURE_SLP_STMT (stmt_info))
1341 ncopies = 1;
1342 else
1343 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1345 if (DR_IS_READ (dr_info->dr))
1346 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1347 outside_cost, prologue_cost_vec, body_cost_vec, false);
1348 else
1349 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE, vect_location,
1353 "vect_get_data_access_cost: inside_cost = %d, "
1354 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1358 typedef struct _vect_peel_info
1360 dr_vec_info *dr_info;
1361 int npeel;
1362 unsigned int count;
1363 } *vect_peel_info;
1365 typedef struct _vect_peel_extended_info
1367 vec_info *vinfo;
1368 struct _vect_peel_info peel_info;
1369 unsigned int inside_cost;
1370 unsigned int outside_cost;
1371 } *vect_peel_extended_info;
1374 /* Peeling hashtable helpers. */
1376 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1378 static inline hashval_t hash (const _vect_peel_info *);
1379 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1382 inline hashval_t
1383 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1385 return (hashval_t) peel_info->npeel;
1388 inline bool
1389 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1391 return (a->npeel == b->npeel);
1395 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1397 static void
1398 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1399 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1400 int npeel)
1402 struct _vect_peel_info elem, *slot;
1403 _vect_peel_info **new_slot;
1404 bool supportable_dr_alignment
1405 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1407 elem.npeel = npeel;
1408 slot = peeling_htab->find (&elem);
1409 if (slot)
1410 slot->count++;
1411 else
1413 slot = XNEW (struct _vect_peel_info);
1414 slot->npeel = npeel;
1415 slot->dr_info = dr_info;
1416 slot->count = 1;
1417 new_slot = peeling_htab->find_slot (slot, INSERT);
1418 *new_slot = slot;
1421 if (!supportable_dr_alignment
1422 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1423 slot->count += VECT_MAX_COST;
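/* Illustrative example: if three data refs become aligned by peeling 2
   iterations and one by peeling 5, the table ends up with the entries
   { npeel = 2, count = 3 } and { npeel = 5, count = 1 }.  Under the
   unlimited cost model, a ref whose misaligned access the target cannot
   handle at all gets its entry's count boosted by VECT_MAX_COST so that
   peeling for it is effectively forced.  */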
1427 /* Traverse peeling hash table to find peeling option that aligns maximum
1428 number of data accesses. */
1431 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1432 _vect_peel_extended_info *max)
1434 vect_peel_info elem = *slot;
1436 if (elem->count > max->peel_info.count
1437 || (elem->count == max->peel_info.count
1438 && max->peel_info.npeel > elem->npeel))
1440 max->peel_info.npeel = elem->npeel;
1441 max->peel_info.count = elem->count;
1442 max->peel_info.dr_info = elem->dr_info;
1445 return 1;
1448 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1449 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1450 we assume DR0_INFO's misalignment will be zero after peeling. */
1452 static void
1453 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1454 dr_vec_info *dr0_info,
1455 unsigned int *inside_cost,
1456 unsigned int *outside_cost,
1457 stmt_vector_for_cost *body_cost_vec,
1458 stmt_vector_for_cost *prologue_cost_vec,
1459 unsigned int npeel,
1460 bool unknown_misalignment)
1462 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1464 for (data_reference *dr : datarefs)
1466 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1467 if (!vect_relevant_for_alignment_p (dr_info))
1468 continue;
1470 int save_misalignment;
1471 save_misalignment = DR_MISALIGNMENT (dr_info);
1472 if (npeel == 0)
1474 else if (unknown_misalignment && dr_info == dr0_info)
1475 SET_DR_MISALIGNMENT (dr_info, 0);
1476 else
1477 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1478 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1479 body_cost_vec, prologue_cost_vec);
1480 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1484 /* Traverse peeling hash table and calculate cost for each peeling option.
1485 Find the one with the lowest cost. */
1488 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1489 _vect_peel_extended_info *min)
1491 vect_peel_info elem = *slot;
1492 int dummy;
1493 unsigned int inside_cost = 0, outside_cost = 0;
1494 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1495 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1496 epilogue_cost_vec;
1498 prologue_cost_vec.create (2);
1499 body_cost_vec.create (2);
1500 epilogue_cost_vec.create (2);
1502 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1503 &outside_cost, &body_cost_vec,
1504 &prologue_cost_vec, elem->npeel, false);
1506 body_cost_vec.release ();
1508 outside_cost += vect_get_known_peeling_cost
1509 (loop_vinfo, elem->npeel, &dummy,
1510 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1511 &prologue_cost_vec, &epilogue_cost_vec);
1513 /* Prologue and epilogue costs are added to the target model later.
1514 These costs depend only on the scalar iteration cost, the
1515 number of peeling iterations finally chosen, and the number of
1516 misaligned statements. So discard the information found here. */
1517 prologue_cost_vec.release ();
1518 epilogue_cost_vec.release ();
1520 if (inside_cost < min->inside_cost
1521 || (inside_cost == min->inside_cost
1522 && outside_cost < min->outside_cost))
1524 min->inside_cost = inside_cost;
1525 min->outside_cost = outside_cost;
1526 min->peel_info.dr_info = elem->dr_info;
1527 min->peel_info.npeel = elem->npeel;
1528 min->peel_info.count = elem->count;
1531 return 1;
1535 /* Choose best peeling option by traversing peeling hash table and either
1536 choosing an option with the lowest cost (if cost model is enabled) or the
1537 option that aligns as many accesses as possible. */
1539 static struct _vect_peel_extended_info
1540 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1541 loop_vec_info loop_vinfo)
1543 struct _vect_peel_extended_info res;
1545 res.peel_info.dr_info = NULL;
1546 res.vinfo = loop_vinfo;
1548 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1550 res.inside_cost = INT_MAX;
1551 res.outside_cost = INT_MAX;
1552 peeling_htab->traverse <_vect_peel_extended_info *,
1553 vect_peeling_hash_get_lowest_cost> (&res);
1555 else
1557 res.peel_info.count = 0;
1558 peeling_htab->traverse <_vect_peel_extended_info *,
1559 vect_peeling_hash_get_most_frequent> (&res);
1560 res.inside_cost = 0;
1561 res.outside_cost = 0;
1564 return res;
1567 /* Return true if the new peeling NPEEL is supported. */
1569 static bool
1570 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1571 unsigned npeel)
1573 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1574 enum dr_alignment_support supportable_dr_alignment;
1576 /* Ensure that all data refs can be vectorized after the peel. */
1577 for (data_reference *dr : datarefs)
1579 int save_misalignment;
1581 if (dr == dr0_info->dr)
1582 continue;
1584 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1585 if (!vect_relevant_for_alignment_p (dr_info))
1586 continue;
1588 save_misalignment = DR_MISALIGNMENT (dr_info);
1589 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1590 supportable_dr_alignment
1591 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1592 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1594 if (!supportable_dr_alignment)
1595 return false;
1598 return true;
1601 /* Compare two data-references DRA and DRB to group them into chunks
1602 with related alignment. */
1604 static int
1605 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1607 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1608 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1609 int cmp;
1611 /* Stabilize sort. */
1612 if (dra == drb)
1613 return 0;
1615 /* Ordering of DRs according to base. */
1616 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1617 DR_BASE_ADDRESS (drb));
1618 if (cmp != 0)
1619 return cmp;
1621 /* And according to DR_OFFSET. */
1622 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1623 if (cmp != 0)
1624 return cmp;
1626 /* And after step. */
1627 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1628 if (cmp != 0)
1629 return cmp;
1631 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1632 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1633 if (cmp == 0)
1634 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1635 return cmp;
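/* Illustrative example: for the refs a[i], a[i+1] and b[i] the comparator
   above places the two accesses to A next to each other (same base,
   offset and step, differing only in DR_INIT), so that the caller can
   treat them as one alignment subgroup, while b[i] starts a new one.  */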
1638 /* Function vect_enhance_data_refs_alignment
1640 This pass will use loop versioning and loop peeling in order to enhance
1641 the alignment of data references in the loop.
1643 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1644 original loop is to be vectorized. Any other loops that are created by
1645 the transformations performed in this pass - are not supposed to be
1646 vectorized. This restriction will be relaxed.
1648 This pass will require a cost model to guide it whether to apply peeling
1649 or versioning or a combination of the two. For example, the scheme that
1650 intel uses when given a loop with several memory accesses, is as follows:
1651 choose one memory access ('p') whose alignment you want to force by doing
1652 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1653 other accesses are not necessarily aligned, or (2) use loop versioning to
1654 generate one loop in which all accesses are aligned, and another loop in
1655 which only 'p' is necessarily aligned.
1657 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1658 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1659 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1661 Devising a cost model is the most critical aspect of this work. It will
1662 guide us on which access to peel for, whether to use loop versioning, how
1663 many versions to create, etc. The cost model will probably consist of
1664 generic considerations as well as target specific considerations (on
1665 powerpc for example, misaligned stores are more painful than misaligned
1666 loads).
1668 Here are the general steps involved in alignment enhancements:
1670 -- original loop, before alignment analysis:
1671 for (i=0; i<N; i++){
1672 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1673 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1676 -- After vect_compute_data_refs_alignment:
1677 for (i=0; i<N; i++){
1678 x = q[i]; # DR_MISALIGNMENT(q) = 3
1679 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1682 -- Possibility 1: we do loop versioning:
1683 if (p is aligned) {
1684 for (i=0; i<N; i++){ # loop 1A
1685 x = q[i]; # DR_MISALIGNMENT(q) = 3
1686 p[i] = y; # DR_MISALIGNMENT(p) = 0
1689 else {
1690 for (i=0; i<N; i++){ # loop 1B
1691 x = q[i]; # DR_MISALIGNMENT(q) = 3
1692 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1696 -- Possibility 2: we do loop peeling:
1697 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1698 x = q[i];
1699 p[i] = y;
1701 for (i = 3; i < N; i++){ # loop 2A
1702 x = q[i]; # DR_MISALIGNMENT(q) = 0
1703 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1706 -- Possibility 3: combination of loop peeling and versioning:
1707 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1708 x = q[i];
1709 p[i] = y;
1711 if (p is aligned) {
1712 for (i = 3; i<N; i++){ # loop 3A
1713 x = q[i]; # DR_MISALIGNMENT(q) = 0
1714 p[i] = y; # DR_MISALIGNMENT(p) = 0
1717 else {
1718 for (i = 3; i<N; i++){ # loop 3B
1719 x = q[i]; # DR_MISALIGNMENT(q) = 0
1720 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1724 These loops are later passed to loop_transform to be vectorized. The
1725 vectorizer will use the alignment information to guide the transformation
1726 (whether to generate regular loads/stores, or with special handling for
1727 misalignment). */
1729 opt_result
1730 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1732 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1733 enum dr_alignment_support supportable_dr_alignment;
1734 dr_vec_info *first_store = NULL;
1735 dr_vec_info *dr0_info = NULL;
1736 struct data_reference *dr;
1737 unsigned int i;
1738 bool do_peeling = false;
1739 bool do_versioning = false;
1740 unsigned int npeel = 0;
1741 bool one_misalignment_known = false;
1742 bool one_misalignment_unknown = false;
1743 bool one_dr_unsupportable = false;
1744 dr_vec_info *unsupportable_dr_info = NULL;
1745 unsigned int mis, dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1746 hash_table<peel_info_hasher> peeling_htab (1);
1748 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1750 /* Reset data so we can safely be called multiple times. */
1751 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1752 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1754 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1755 return opt_result::success ();
1757 /* Sort the vector of datarefs so DRs that have the same or dependent
1758 alignment are next to each other. */
1759 auto_vec<data_reference_p> datarefs
1760 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1761 datarefs.qsort (dr_align_group_sort_cmp);
1763 /* For each dataref, compute the number of other DRs that become
1764 aligned when we peel to align that dataref. */
1765 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1766 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1767 unsigned i0;
1768 for (i0 = 0; i0 < datarefs.length (); ++i0)
1769 if (DR_BASE_ADDRESS (datarefs[i0]))
1770 break;
1771 for (i = i0 + 1; i <= datarefs.length (); ++i)
1773 if (i == datarefs.length ()
1774 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1775 DR_BASE_ADDRESS (datarefs[i]), 0)
1776 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1777 DR_OFFSET (datarefs[i]), 0)
1778 || !operand_equal_p (DR_STEP (datarefs[i0]),
1779 DR_STEP (datarefs[i]), 0))
1781 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1782 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1783 will get known misalignment if we align one of the refs
1784 with the largest DR_TARGET_ALIGNMENT. */
1785 for (unsigned j = i0; j < i; ++j)
1787 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1788 for (unsigned k = i0; k < i; ++k)
1790 if (k == j)
1791 continue;
1792 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1793 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1794 dr_infoj))
1795 n_same_align_refs[j]++;
1798 i0 = i;
1802 /* While cost model enhancements are expected in the future, the high level
1803 view of the code at this time is as follows:
1805 A) If there is a misaligned access then see if peeling to align
1806 this access can make all data references satisfy
1807 vect_supportable_dr_alignment. If so, update data structures
1808 as needed and return true.
1810 B) If peeling wasn't possible and there is a data reference with an
1811 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1812 then see if loop versioning checks can be used to make all data
1813 references satisfy vect_supportable_dr_alignment. If so, update
1814 data structures as needed and return true.
1816 C) If neither peeling nor versioning were successful then return false if
1817 any data reference does not satisfy vect_supportable_dr_alignment.
1819 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1821 Note, Possibility 3 above (which is peeling and versioning together) is not
1822 being done at this time. */
1824 /* (1) Peeling to force alignment. */
1826 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1827 Considerations:
1828 + How many accesses will become aligned due to the peeling
1829 - How many accesses will become unaligned due to the peeling,
1830 and the cost of misaligned accesses.
1831 - The cost of peeling (the extra runtime checks, the increase
1832 in code size). */
1834 FOR_EACH_VEC_ELT (datarefs, i, dr)
1836 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1837 if (!vect_relevant_for_alignment_p (dr_info))
1838 continue;
1840 stmt_vec_info stmt_info = dr_info->stmt;
1841 supportable_dr_alignment
1842 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1843 do_peeling = vector_alignment_reachable_p (dr_info);
1844 if (do_peeling)
1846 if (known_alignment_for_access_p (dr_info))
1848 unsigned int npeel_tmp = 0;
1849 bool negative = tree_int_cst_compare (DR_STEP (dr),
1850 size_zero_node) < 0;
1852 /* If known_alignment_for_access_p then we have set
1853 DR_MISALIGNMENT, which is only done if we know it at compile
1854 time, so it is safe to assume target alignment is constant.
1856 unsigned int target_align =
1857 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1858 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1859 mis = (negative
1860 ? DR_MISALIGNMENT (dr_info)
1861 : -DR_MISALIGNMENT (dr_info));
1862 if (DR_MISALIGNMENT (dr_info) != 0)
1863 npeel_tmp = (mis & (target_align - 1)) / dr_size;
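/* Illustrative example (hypothetical numbers, not taken from any particular
   target): with a positive step, DR_MISALIGNMENT = 4, target_align = 16 and
   dr_size = 4, mis = -4 and (mis & 15) = 12, so npeel_tmp = 3; peeling 3
   scalar iterations advances the access by 12 bytes and makes it
   16-byte aligned.  */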
1865 /* For multiple types, it is possible that the bigger type access
1866 will have more than one peeling option. E.g., a loop with two
1867 types: one of size (vector size / 4), and the other one of
1868 size (vector size / 8). The vectorization factor will be 8. If both
1869 accesses are misaligned by 3, the first one needs one scalar
1870 iteration to be aligned, and the second one needs 5. But the
1871 first one will also be aligned by peeling 5 scalar
1872 iterations, and in that case both accesses will be aligned.
1873 Hence, in addition to the immediate peeling amount, we also want
1874 to try adding full vector sizes, as long as we don't exceed the
1875 vectorization factor.
1876 We do this automatically for the cost model, since we calculate
1877 the cost for every peeling option.
1878 poly_uint64 nscalars = npeel_tmp;
1879 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1881 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1882 nscalars = (STMT_SLP_TYPE (stmt_info)
1883 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1886 /* Save info about DR in the hash table. Also include peeling
1887 amounts according to the explanation above. */
1888 while (known_le (npeel_tmp, nscalars))
1890 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1891 dr_info, npeel_tmp);
1892 npeel_tmp += MAX (1, target_align / dr_size);
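/* Continuing the hypothetical example above with an unlimited cost model,
   vf = 8, no SLP and target_align / dr_size = 4: the peeling options
   recorded in the hash table are npeel_tmp = 3 and npeel_tmp = 7
   (11 would exceed nscalars = 8).  */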
1895 one_misalignment_known = true;
1897 else
1899 /* If we don't know any misalignment values, we prefer
1900 peeling for the data-ref that has the maximum number of data-refs
1901 with the same alignment, unless the target prefers to align
1902 stores over loads. */
1903 unsigned same_align_drs = n_same_align_refs[i];
1904 if (!dr0_info
1905 || dr0_same_align_drs < same_align_drs)
1907 dr0_same_align_drs = same_align_drs;
1908 dr0_info = dr_info;
1910 /* For data-refs with the same number of related
1911 accesses prefer the one where the misalign
1912 computation will be invariant in the outermost loop. */
1913 else if (dr0_same_align_drs == same_align_drs)
1915 class loop *ivloop0, *ivloop;
1916 ivloop0 = outermost_invariant_loop_for_expr
1917 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1918 ivloop = outermost_invariant_loop_for_expr
1919 (loop, DR_BASE_ADDRESS (dr));
1920 if ((ivloop && !ivloop0)
1921 || (ivloop && ivloop0
1922 && flow_loop_nested_p (ivloop, ivloop0)))
1923 dr0_info = dr_info;
1926 one_misalignment_unknown = true;
1928 /* Check for data refs with unsupportable alignment that
1929 can be peeled. */
1930 if (!supportable_dr_alignment)
1932 one_dr_unsupportable = true;
1933 unsupportable_dr_info = dr_info;
1936 if (!first_store && DR_IS_WRITE (dr))
1938 first_store = dr_info;
1939 first_store_same_align_drs = same_align_drs;
1943 else
1945 if (!aligned_access_p (dr_info))
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949 "vector alignment may not be reachable\n");
1950 break;
1955 /* Check if we can possibly peel the loop. */
1956 if (!vect_can_advance_ivs_p (loop_vinfo)
1957 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1958 || loop->inner)
1959 do_peeling = false;
1961 struct _vect_peel_extended_info peel_for_known_alignment;
1962 struct _vect_peel_extended_info peel_for_unknown_alignment;
1963 struct _vect_peel_extended_info best_peel;
1965 peel_for_unknown_alignment.inside_cost = INT_MAX;
1966 peel_for_unknown_alignment.outside_cost = INT_MAX;
1967 peel_for_unknown_alignment.peel_info.count = 0;
1969 if (do_peeling
1970 && one_misalignment_unknown)
1972 /* Check whether the target prefers stores over loads, i.e., whether
1973 misaligned stores are more expensive than misaligned loads (taking
1974 DRs with the same alignment into account). */
1975 unsigned int load_inside_cost = 0;
1976 unsigned int load_outside_cost = 0;
1977 unsigned int store_inside_cost = 0;
1978 unsigned int store_outside_cost = 0;
1979 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1981 stmt_vector_for_cost dummy;
1982 dummy.create (2);
1983 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1984 &load_inside_cost,
1985 &load_outside_cost,
1986 &dummy, &dummy, estimated_npeels, true);
1987 dummy.release ();
1989 if (first_store)
1991 dummy.create (2);
1992 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1993 &store_inside_cost,
1994 &store_outside_cost,
1995 &dummy, &dummy,
1996 estimated_npeels, true);
1997 dummy.release ();
1999 else
2001 store_inside_cost = INT_MAX;
2002 store_outside_cost = INT_MAX;
2005 if (load_inside_cost > store_inside_cost
2006 || (load_inside_cost == store_inside_cost
2007 && load_outside_cost > store_outside_cost))
2009 dr0_info = first_store;
2010 dr0_same_align_drs = first_store_same_align_drs;
2011 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2012 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2014 else
2016 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2017 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2020 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2021 prologue_cost_vec.create (2);
2022 epilogue_cost_vec.create (2);
2024 int dummy2;
2025 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2026 (loop_vinfo, estimated_npeels, &dummy2,
2027 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2028 &prologue_cost_vec, &epilogue_cost_vec);
2030 prologue_cost_vec.release ();
2031 epilogue_cost_vec.release ();
2033 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2036 peel_for_unknown_alignment.peel_info.npeel = 0;
2037 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2039 best_peel = peel_for_unknown_alignment;
2041 peel_for_known_alignment.inside_cost = INT_MAX;
2042 peel_for_known_alignment.outside_cost = INT_MAX;
2043 peel_for_known_alignment.peel_info.count = 0;
2044 peel_for_known_alignment.peel_info.dr_info = NULL;
2046 if (do_peeling && one_misalignment_known)
2048 /* Peeling is possible, but no data access requires alignment in order
2049 to be supported. So we try to choose the best possible peeling from
2050 the hash table. */
2051 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2052 (&peeling_htab, loop_vinfo);
2055 /* Compare costs of peeling for known and unknown alignment. */
2056 if (peel_for_known_alignment.peel_info.dr_info != NULL
2057 && peel_for_unknown_alignment.inside_cost
2058 >= peel_for_known_alignment.inside_cost)
2060 best_peel = peel_for_known_alignment;
2062 /* If the best peeling for known alignment has NPEEL == 0, perform no
2063 peeling at all except if there is an unsupportable dr that we can
2064 align. */
2065 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2066 do_peeling = false;
2069 /* If there is an unsupportable data ref, prefer this over all choices so far
2070 since we'd have to discard a chosen peeling except when it accidentally
2071 aligned the unsupportable data ref. */
2072 if (one_dr_unsupportable)
2073 dr0_info = unsupportable_dr_info;
2074 else if (do_peeling)
2076 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2077 TODO: Use nopeel_outside_cost or get rid of it? */
2078 unsigned nopeel_inside_cost = 0;
2079 unsigned nopeel_outside_cost = 0;
2081 stmt_vector_for_cost dummy;
2082 dummy.create (2);
2083 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2084 &nopeel_outside_cost, &dummy, &dummy,
2085 0, false);
2086 dummy.release ();
2088 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2089 costs will be recorded. */
2090 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2091 prologue_cost_vec.create (2);
2092 epilogue_cost_vec.create (2);
2094 int dummy2;
2095 nopeel_outside_cost += vect_get_known_peeling_cost
2096 (loop_vinfo, 0, &dummy2,
2097 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2098 &prologue_cost_vec, &epilogue_cost_vec);
2100 prologue_cost_vec.release ();
2101 epilogue_cost_vec.release ();
2103 npeel = best_peel.peel_info.npeel;
2104 dr0_info = best_peel.peel_info.dr_info;
2106 /* If not peeling at all is no more expensive than the best peeling
2107 we have so far, don't perform any peeling. */
2108 if (nopeel_inside_cost <= best_peel.inside_cost)
2109 do_peeling = false;
2112 if (do_peeling)
2114 stmt_vec_info stmt_info = dr0_info->stmt;
2115 if (known_alignment_for_access_p (dr0_info))
2117 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2118 size_zero_node) < 0;
2119 if (!npeel)
2121 /* Since it's known at compile time, compute the number of
2122 iterations in the peeled loop (the peeling factor) for use in
2123 updating DR_MISALIGNMENT values. The peeling factor is the
2124 target alignment minus the misalignment, expressed as an element
2125 count. */
2126 mis = (negative
2127 ? DR_MISALIGNMENT (dr0_info)
2128 : -DR_MISALIGNMENT (dr0_info));
2129 /* If known_alignment_for_access_p then we have set
2130 DR_MISALIGNMENT, which is only done if we know it at compile
2131 time, so it is safe to assume target alignment is constant.
2133 unsigned int target_align =
2134 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2135 npeel = ((mis & (target_align - 1))
2136 / vect_get_scalar_dr_size (dr0_info));
2139 /* For interleaved data access every iteration accesses all the
2140 members of the group, therefore we divide the number of iterations
2141 by the group size. */
2142 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2143 npeel /= DR_GROUP_SIZE (stmt_info);
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_NOTE, vect_location,
2147 "Try peeling by %d\n", npeel);
2150 /* Ensure that all datarefs can be vectorized after the peel. */
2151 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2152 do_peeling = false;
2154 /* Check if all datarefs are supportable and log. */
2155 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2156 return opt_result::success ();
2158 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2159 if (do_peeling)
2161 unsigned max_allowed_peel
2162 = param_vect_max_peeling_for_alignment;
2163 if (flag_vect_cost_model <= VECT_COST_MODEL_CHEAP)
2164 max_allowed_peel = 0;
2165 if (max_allowed_peel != (unsigned)-1)
2167 unsigned max_peel = npeel;
2168 if (max_peel == 0)
2170 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2171 unsigned HOST_WIDE_INT target_align_c;
2172 if (target_align.is_constant (&target_align_c))
2173 max_peel =
2174 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2175 else
2177 do_peeling = false;
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_NOTE, vect_location,
2180 "Disable peeling, max peels set and vector"
2181 " alignment unknown\n");
2184 if (max_peel > max_allowed_peel)
2186 do_peeling = false;
2187 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_NOTE, vect_location,
2189 "Disable peeling, max peels reached: %d\n", max_peel);
2194 /* Cost model #2 - if peeling may result in a remaining loop not
2195 iterating enough to be vectorized then do not peel. Since this
2196 is a cost heuristic rather than a correctness decision, use the
2197 most likely runtime value for variable vectorization factors. */
2198 if (do_peeling
2199 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2201 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2202 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2203 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2204 < assumed_vf + max_peel)
2205 do_peeling = false;
2208 if (do_peeling)
2210 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2211 If the misalignment of DR_i is identical to that of dr0 then set
2212 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2213 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2214 by the peeling factor times the element size of DR_i (MOD the
2215 vectorization factor times the size). Otherwise, the
2216 misalignment of DR_i must be set to unknown. */
2217 FOR_EACH_VEC_ELT (datarefs, i, dr)
2218 if (dr != dr0_info->dr)
2220 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2221 if (!vect_relevant_for_alignment_p (dr_info))
2222 continue;
2224 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2227 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2228 if (npeel)
2229 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2230 else
2231 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2232 = DR_MISALIGNMENT (dr0_info);
2233 SET_DR_MISALIGNMENT (dr0_info, 0);
2234 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE, vect_location,
2237 "Alignment of access forced using peeling.\n");
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "Peeling for alignment will be applied.\n");
2242 /* The inside-loop cost will be accounted for in vectorizable_load
2243 and vectorizable_store correctly with adjusted alignments.
2244 Drop the body_cst_vec on the floor here. */
2245 return opt_result::success ();
2249 /* (2) Versioning to force alignment. */
2251 /* Try versioning if:
2252 1) optimize loop for speed and the cost-model is not cheap
2253 2) there is at least one unsupported misaligned data ref with an unknown
2254 misalignment, and
2255 3) all misaligned data refs with a known misalignment are supported, and
2256 4) the number of runtime alignment checks is within reason. */
2258 do_versioning
2259 = (optimize_loop_nest_for_speed_p (loop)
2260 && !loop->inner /* FORNOW */
2261 && flag_vect_cost_model > VECT_COST_MODEL_CHEAP);
2263 if (do_versioning)
2265 FOR_EACH_VEC_ELT (datarefs, i, dr)
2267 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2268 if (aligned_access_p (dr_info)
2269 || !vect_relevant_for_alignment_p (dr_info))
2270 continue;
2272 stmt_vec_info stmt_info = dr_info->stmt;
2273 if (STMT_VINFO_STRIDED_P (stmt_info))
2275 do_versioning = false;
2276 break;
2279 supportable_dr_alignment
2280 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2282 if (!supportable_dr_alignment)
2284 int mask;
2285 tree vectype;
2287 if (known_alignment_for_access_p (dr_info)
2288 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2289 >= (unsigned) param_vect_max_version_for_alignment_checks)
2291 do_versioning = false;
2292 break;
2295 vectype = STMT_VINFO_VECTYPE (stmt_info);
2296 gcc_assert (vectype);
2298 /* At present we don't support versioning for alignment
2299 with variable VF, since there's no guarantee that the
2300 VF is a power of two. We could relax this if we added
2301 a way of enforcing a power-of-two size. */
2302 unsigned HOST_WIDE_INT size;
2303 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2305 do_versioning = false;
2306 break;
2309 /* Forcing alignment in the first iteration is no good if
2310 we don't keep it across iterations. For now, just disable
2311 versioning in this case.
2312 ?? We could actually unroll the loop to achieve the required
2313 overall step alignment, and forcing the alignment could be
2314 done by doing some iterations of the non-vectorized loop. */
2315 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2316 * DR_STEP_ALIGNMENT (dr),
2317 DR_TARGET_ALIGNMENT (dr_info)))
2319 do_versioning = false;
2320 break;
2323 /* The rightmost bits of an aligned address must be zeros.
2324 Construct the mask needed for this test. For example,
2325 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2326 mask must be 15 = 0xf. */
2327 mask = size - 1;
2329 /* FORNOW: use the same mask to test all potentially unaligned
2330 references in the loop. */
2331 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2332 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2334 do_versioning = false;
2335 break;
2338 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2339 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2343 /* Versioning requires at least one misaligned data reference. */
2344 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2345 do_versioning = false;
2346 else if (!do_versioning)
2347 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2350 if (do_versioning)
2352 const vec<stmt_vec_info> &may_misalign_stmts
2353 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2354 stmt_vec_info stmt_info;
2356 /* It can now be assumed that the data references in the statements
2357 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2358 of the loop being vectorized. */
2359 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2361 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2362 SET_DR_MISALIGNMENT (dr_info, 0);
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE, vect_location,
2365 "Alignment of access forced using versioning.\n");
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE, vect_location,
2370 "Versioning for alignment will be applied.\n");
2372 /* Peeling and versioning can't be done together at this time. */
2373 gcc_assert (! (do_peeling && do_versioning));
2375 return opt_result::success ();
2378 /* This point is reached if neither peeling nor versioning is being done. */
2379 gcc_assert (! (do_peeling || do_versioning));
2381 return opt_result::success ();
2385 /* Function vect_analyze_data_refs_alignment
2387 Analyze the alignment of the data-references in the loop.
2388 Return FALSE if a data reference is found that cannot be vectorized. */
2390 opt_result
2391 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2393 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2395 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2396 struct data_reference *dr;
2397 unsigned int i;
2399 vect_record_base_alignments (loop_vinfo);
2400 FOR_EACH_VEC_ELT (datarefs, i, dr)
2402 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2403 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2404 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2407 return opt_result::success ();
2411 /* Analyze alignment of DRs of stmts in NODE. */
2413 static bool
2414 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2416 /* We vectorize from the first scalar stmt in the node unless
2417 the node is permuted in which case we start from the first
2418 element in the group. */
2419 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2420 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2421 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2422 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2424 /* We need to commit to a vector type for the group now. */
2425 if (is_a <bb_vec_info> (vinfo)
2426 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "desired vector type conflicts with earlier one "
2431 "for %G", first_stmt_info->stmt);
2432 return false;
2435 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2436 vect_compute_data_ref_alignment (vinfo, dr_info);
2437 /* In several places we need alignment of the first element anyway. */
2438 if (dr_info != first_dr_info)
2439 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2441 /* For creating the data-ref pointer we need alignment of the
2442 first element as well. */
2443 first_stmt_info
2444 = vect_stmt_to_vectorize (vect_find_first_scalar_stmt_in_slp (node));
2445 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2447 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2448 if (dr_info != first_dr_info)
2449 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2452 return true;
2455 /* Function vect_slp_analyze_instance_alignment
2457 Analyze the alignment of the data-references in the SLP instance.
2458 Return FALSE if a data reference is found that cannot be vectorized. */
2460 bool
2461 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2462 slp_instance instance)
2464 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2466 slp_tree node;
2467 unsigned i;
2468 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2469 if (! vect_slp_analyze_node_alignment (vinfo, node))
2470 return false;
2472 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2473 && ! vect_slp_analyze_node_alignment
2474 (vinfo, SLP_INSTANCE_TREE (instance)))
2475 return false;
2477 return true;
2481 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2482 accesses of legal size, step, etc. Detect gaps, single element
2483 interleaving, and other special cases. Set grouped access info.
2484 Collect groups of strided stores for further use in SLP analysis.
2485 Worker for vect_analyze_group_access. */
2487 static bool
2488 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2490 data_reference *dr = dr_info->dr;
2491 tree step = DR_STEP (dr);
2492 tree scalar_type = TREE_TYPE (DR_REF (dr));
2493 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2494 stmt_vec_info stmt_info = dr_info->stmt;
2495 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2496 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2497 HOST_WIDE_INT dr_step = -1;
2498 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2499 bool slp_impossible = false;
2501 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2502 size of the interleaving group (including gaps). */
2503 if (tree_fits_shwi_p (step))
2505 dr_step = tree_to_shwi (step);
2506 /* Check that STEP is a multiple of type size. Otherwise there is
2507 a non-element-sized gap at the end of the group which we
2508 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2509 ??? As we can handle non-constant step fine here we should
2510 simply remove uses of DR_GROUP_GAP between the last and first
2511 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2512 simply not include that gap. */
2513 if ((dr_step % type_size) != 0)
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location,
2517 "Step %T is not a multiple of the element size"
2518 " for %T\n",
2519 step, DR_REF (dr));
2520 return false;
2522 groupsize = absu_hwi (dr_step) / type_size;
2524 else
2525 groupsize = 0;
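/* For illustration (made-up numbers): with scalar_type int (type_size = 4)
   and a constant DR_STEP of 32 bytes, groupsize = 8, i.e. each scalar
   iteration steps over a group of 8 elements, possibly with gaps.  If such
   a DR turns out not to be part of a detected group and is a read, it is
   treated below as single-element interleaving with DR_GROUP_SIZE = 8 and
   DR_GROUP_GAP = 7.  */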
2527 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2528 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2530 /* Check if this DR is part of an interleaving group, and is a single
2531 element of the group that is accessed in the loop. */
2533 /* Gaps are supported only for loads. STEP must be a multiple of the type
2534 size. */
2535 if (DR_IS_READ (dr)
2536 && (dr_step % type_size) == 0
2537 && groupsize > 0
2538 /* This could be UINT_MAX but as we are generating code in a very
2539 inefficient way we have to cap earlier.
2540 See PR91403 for example. */
2541 && groupsize <= 4096)
2543 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2544 DR_GROUP_SIZE (stmt_info) = groupsize;
2545 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_NOTE, vect_location,
2548 "Detected single element interleaving %T"
2549 " step %T\n",
2550 DR_REF (dr), step);
2552 return true;
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2557 "not consecutive access %G", stmt_info->stmt);
2559 if (bb_vinfo)
2561 /* Mark the statement as unvectorizable. */
2562 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2563 return true;
2566 if (dump_enabled_p ())
2567 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2568 STMT_VINFO_STRIDED_P (stmt_info) = true;
2569 return true;
2572 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2574 /* First stmt in the interleaving chain. Check the chain. */
2575 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2576 struct data_reference *data_ref = dr;
2577 unsigned int count = 1;
2578 tree prev_init = DR_INIT (data_ref);
2579 HOST_WIDE_INT diff, gaps = 0;
2581 /* By construction, all group members have INTEGER_CST DR_INITs. */
2582 while (next)
2584 /* We never have the same DR multiple times. */
2585 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2586 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2588 data_ref = STMT_VINFO_DATA_REF (next);
2590 /* All group members have the same STEP by construction. */
2591 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2593 /* Check that the distance between two accesses is equal to the type
2594 size. Otherwise, we have gaps. */
2595 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2596 - TREE_INT_CST_LOW (prev_init)) / type_size;
2597 if (diff != 1)
2599 /* FORNOW: SLP of accesses with gaps is not supported. */
2600 slp_impossible = true;
2601 if (DR_IS_WRITE (data_ref))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2605 "interleaved store with gaps\n");
2606 return false;
2609 gaps += diff - 1;
2612 last_accessed_element += diff;
2614 /* Store the gap from the previous member of the group. If there is no
2615 gap in the access, DR_GROUP_GAP is always 1. */
2616 DR_GROUP_GAP (next) = diff;
2618 prev_init = DR_INIT (data_ref);
2619 next = DR_GROUP_NEXT_ELEMENT (next);
2620 /* Count the number of data-refs in the chain. */
2621 count++;
2624 if (groupsize == 0)
2625 groupsize = count + gaps;
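/* A hypothetical example: a load group whose members have DR_INIT 0, 4
   and 12, with type_size = 4 and a non-constant DR_STEP, gives diffs of
   1 and 2, so count = 3, gaps = 1 and groupsize becomes 3 + 1 = 4 here;
   DR_GROUP_GAP of the third member is 2 (one missing element) and, since
   last_accessed_element ends up as 4, the trailing DR_GROUP_GAP computed
   below is 0.  */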
2627 /* This could be UINT_MAX but as we are generating code in a very
2628 inefficient way we have to cap earlier. See PR78699 for example. */
2629 if (groupsize > 4096)
2631 if (dump_enabled_p ())
2632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2633 "group is too large\n");
2634 return false;
2637 /* Check that the size of the interleaving is equal to count for stores,
2638 i.e., that there are no gaps. */
2639 if (groupsize != count
2640 && !DR_IS_READ (dr))
2642 groupsize = count;
2643 STMT_VINFO_STRIDED_P (stmt_info) = true;
2646 /* If there is a gap after the last load in the group it is the
2647 difference between the groupsize and the last accessed
2648 element.
2649 When there is no gap, this difference should be 0. */
2650 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2652 DR_GROUP_SIZE (stmt_info) = groupsize;
2653 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE, vect_location,
2656 "Detected interleaving ");
2657 if (DR_IS_READ (dr))
2658 dump_printf (MSG_NOTE, "load ");
2659 else if (STMT_VINFO_STRIDED_P (stmt_info))
2660 dump_printf (MSG_NOTE, "strided store ");
2661 else
2662 dump_printf (MSG_NOTE, "store ");
2663 dump_printf (MSG_NOTE, "of size %u\n",
2664 (unsigned)groupsize);
2665 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2666 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2667 while (next)
2669 if (DR_GROUP_GAP (next) != 1)
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "\t<gap of %d elements>\n",
2672 DR_GROUP_GAP (next) - 1);
2673 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2674 next = DR_GROUP_NEXT_ELEMENT (next);
2676 if (DR_GROUP_GAP (stmt_info) != 0)
2677 dump_printf_loc (MSG_NOTE, vect_location,
2678 "\t<gap of %d elements>\n",
2679 DR_GROUP_GAP (stmt_info));
2682 /* SLP: create an SLP data structure for every interleaving group of
2683 stores for further analysis in vect_analyze_slp. */
2684 if (DR_IS_WRITE (dr) && !slp_impossible)
2686 if (loop_vinfo)
2687 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2688 if (bb_vinfo)
2689 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2693 return true;
2696 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2697 accesses of legal size, step, etc. Detect gaps, single element
2698 interleaving, and other special cases. Set grouped access info.
2699 Collect groups of strided stores for further use in SLP analysis. */
2701 static bool
2702 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2704 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2706 /* Dissolve the group if present. */
2707 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2708 while (stmt_info)
2710 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2711 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2712 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2713 stmt_info = next;
2715 return false;
2717 return true;
2720 /* Analyze the access pattern of the data-reference DR_INFO.
2721 In case of non-consecutive accesses call vect_analyze_group_access() to
2722 analyze groups of accesses. */
2724 static bool
2725 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2727 data_reference *dr = dr_info->dr;
2728 tree step = DR_STEP (dr);
2729 tree scalar_type = TREE_TYPE (DR_REF (dr));
2730 stmt_vec_info stmt_info = dr_info->stmt;
2731 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2732 class loop *loop = NULL;
2734 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2735 return true;
2737 if (loop_vinfo)
2738 loop = LOOP_VINFO_LOOP (loop_vinfo);
2740 if (loop_vinfo && !step)
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744 "bad data-ref access in loop\n");
2745 return false;
2748 /* Allow loads with zero step in inner-loop vectorization. */
2749 if (loop_vinfo && integer_zerop (step))
2751 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2752 if (!nested_in_vect_loop_p (loop, stmt_info))
2753 return DR_IS_READ (dr);
2754 /* Allow references with zero step for outer loops marked
2755 with pragma omp simd only - it guarantees absence of
2756 loop-carried dependencies between inner loop iterations. */
2757 if (loop->safelen < 2)
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_NOTE, vect_location,
2761 "zero step in inner loop of nest\n");
2762 return false;
2766 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2768 /* Interleaved accesses are not yet supported within outer-loop
2769 vectorization for references in the inner-loop. */
2770 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2772 /* For the rest of the analysis we use the outer-loop step. */
2773 step = STMT_VINFO_DR_STEP (stmt_info);
2774 if (integer_zerop (step))
2776 if (dump_enabled_p ())
2777 dump_printf_loc (MSG_NOTE, vect_location,
2778 "zero step in outer loop.\n");
2779 return DR_IS_READ (dr);
2783 /* Consecutive? */
2784 if (TREE_CODE (step) == INTEGER_CST)
2786 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2787 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2788 || (dr_step < 0
2789 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2791 /* Mark that it is not interleaving. */
2792 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2793 return true;
2797 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_NOTE, vect_location,
2801 "grouped access in outer loop.\n");
2802 return false;
2806 /* Assume this is a DR handled by the non-constant strided load case. */
2807 if (TREE_CODE (step) != INTEGER_CST)
2808 return (STMT_VINFO_STRIDED_P (stmt_info)
2809 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2810 || vect_analyze_group_access (vinfo, dr_info)));
2812 /* Not a consecutive access - check if it's part of an interleaving group. */
2813 return vect_analyze_group_access (vinfo, dr_info);
2816 typedef std::pair<data_reference_p, int> data_ref_pair;
2818 /* Comparison function used to sort data-references DRA and DRB into
2819 chunks suitable for grouping. */
2821 static int
2822 dr_group_sort_cmp (const void *dra_, const void *drb_)
2824 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2825 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2826 data_reference_p dra = dra_pair.first;
2827 data_reference_p drb = drb_pair.first;
2828 int cmp;
2830 /* Stabilize sort. */
2831 if (dra == drb)
2832 return 0;
2834 /* DRs with different group IDs never belong to the same group. */
2835 if (dra_pair.second != drb_pair.second)
2836 return dra_pair.second < drb_pair.second ? -1 : 1;
2838 /* Ordering of DRs according to base. */
2839 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2840 DR_BASE_ADDRESS (drb));
2841 if (cmp != 0)
2842 return cmp;
2844 /* And according to DR_OFFSET. */
2845 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2846 if (cmp != 0)
2847 return cmp;
2849 /* Put reads before writes. */
2850 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2851 return DR_IS_READ (dra) ? -1 : 1;
2853 /* Then sort by access size. */
2854 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2855 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2856 if (cmp != 0)
2857 return cmp;
2859 /* And by step. */
2860 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2861 if (cmp != 0)
2862 return cmp;
2864 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2865 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2866 if (cmp == 0)
2867 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2868 return cmp;
2871 /* If OP is the result of a conversion, return the unconverted value,
2872 otherwise return null. */
2874 static tree
2875 strip_conversion (tree op)
2877 if (TREE_CODE (op) != SSA_NAME)
2878 return NULL_TREE;
2879 gimple *stmt = SSA_NAME_DEF_STMT (op);
2880 if (!is_gimple_assign (stmt)
2881 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2882 return NULL_TREE;
2883 return gimple_assign_rhs1 (stmt);
2886 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2887 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2888 be grouped in SLP mode. */
2890 static bool
2891 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2892 bool allow_slp_p)
2894 if (gimple_assign_single_p (stmt1_info->stmt))
2895 return gimple_assign_single_p (stmt2_info->stmt);
2897 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2898 if (call1 && gimple_call_internal_p (call1))
2900 /* Check for two masked loads or two masked stores. */
2901 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2902 if (!call2 || !gimple_call_internal_p (call2))
2903 return false;
2904 internal_fn ifn = gimple_call_internal_fn (call1);
2905 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2906 return false;
2907 if (ifn != gimple_call_internal_fn (call2))
2908 return false;
2910 /* Check that the masks are the same. Cope with casts of masks,
2911 like those created by build_mask_conversion. */
2912 tree mask1 = gimple_call_arg (call1, 2);
2913 tree mask2 = gimple_call_arg (call2, 2);
2914 if (!operand_equal_p (mask1, mask2, 0)
2915 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2917 mask1 = strip_conversion (mask1);
2918 if (!mask1)
2919 return false;
2920 mask2 = strip_conversion (mask2);
2921 if (!mask2)
2922 return false;
2923 if (!operand_equal_p (mask1, mask2, 0))
2924 return false;
2926 return true;
2929 return false;
2932 /* Function vect_analyze_data_ref_accesses.
2934 Analyze the access pattern of all the data references in the loop.
2936 FORNOW: the only access pattern that is considered vectorizable is a
2937 simple step 1 (consecutive) access.
2939 FORNOW: handle only arrays and pointer accesses. */
2941 opt_result
2942 vect_analyze_data_ref_accesses (vec_info *vinfo,
2943 vec<int> *dataref_groups)
2945 unsigned int i;
2946 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2948 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2950 if (datarefs.is_empty ())
2951 return opt_result::success ();
2953 /* Sort the array of datarefs to make building the interleaving chains
2954 linear. Don't modify the original vector's order, it is needed for
2955 determining what dependencies are reversed. */
2956 vec<data_ref_pair> datarefs_copy;
2957 datarefs_copy.create (datarefs.length ());
2958 for (unsigned i = 0; i < datarefs.length (); i++)
2960 int group_id;
2961 /* If the caller computed DR grouping use that, otherwise group by
2962 basic blocks. */
2963 if (dataref_groups)
2964 group_id = (*dataref_groups)[i];
2965 else
2966 group_id = gimple_bb (DR_STMT (datarefs[i]))->index;
2967 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2969 datarefs_copy.qsort (dr_group_sort_cmp);
2970 hash_set<stmt_vec_info> to_fixup;
2972 /* Build the interleaving chains. */
2973 for (i = 0; i < datarefs_copy.length () - 1;)
2975 data_reference_p dra = datarefs_copy[i].first;
2976 int dra_group_id = datarefs_copy[i].second;
2977 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2978 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2979 stmt_vec_info lastinfo = NULL;
2980 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2981 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2983 ++i;
2984 continue;
2986 for (i = i + 1; i < datarefs_copy.length (); ++i)
2988 data_reference_p drb = datarefs_copy[i].first;
2989 int drb_group_id = datarefs_copy[i].second;
2990 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2991 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2992 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2993 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2994 break;
2996 /* ??? Imperfect sorting (non-compatible types, non-modulo
2997 accesses, same accesses) can lead to a group being artificially
2998 split here as we don't just skip over those. If it really
2999 matters we can push those to a worklist and re-iterate
3000 over them. Then we can just skip ahead to the next DR here. */
3002 /* DRs in a different DR group should not be put into the same
3003 interleaving group. */
3004 if (dra_group_id != drb_group_id)
3005 break;
3007 /* Check that the data-refs have the same first location (except init)
3008 and that they are both either stores or loads (not a load and a
3009 store, and not mismatched masked loads or stores). */
3010 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3011 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3012 DR_BASE_ADDRESS (drb)) != 0
3013 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3014 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3015 break;
3017 /* Check that the data-refs have the same constant size. */
3018 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3019 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3020 if (!tree_fits_uhwi_p (sza)
3021 || !tree_fits_uhwi_p (szb)
3022 || !tree_int_cst_equal (sza, szb))
3023 break;
3025 /* Check that the data-refs have the same step. */
3026 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3027 break;
3029 /* Check the types are compatible.
3030 ??? We don't distinguish this during sorting. */
3031 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3032 TREE_TYPE (DR_REF (drb))))
3033 break;
3035 /* Check that the DR_INITs are compile-time constants. */
3036 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3037 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3038 break;
3040 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3041 just hold extra information. */
3042 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3043 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3044 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3045 break;
3047 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3048 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3049 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3050 HOST_WIDE_INT init_prev
3051 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3052 gcc_assert (init_a <= init_b
3053 && init_a <= init_prev
3054 && init_prev <= init_b);
3056 /* Do not place the same access in the interleaving chain twice. */
3057 if (init_b == init_prev)
3059 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3060 < gimple_uid (DR_STMT (drb)));
3061 /* Simply link in duplicates and fix up the chain below. */
3063 else
3065 /* If init_b == init_a + the size of the type * k, we have an
3066 interleaving, and DRA is accessed before DRB. */
3067 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3068 if (type_size_a == 0
3069 || (init_b - init_a) % type_size_a != 0)
3070 break;
3072 /* If we have a store, the accesses are adjacent. This splits
3073 groups into chunks we support (we don't support vectorization
3074 of stores with gaps). */
3075 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3076 break;
3078 /* If the step (when nonzero and constant) is smaller than the
3079 difference between the data-refs' inits, this splits groups into
3080 suitable sizes. */
3081 if (tree_fits_shwi_p (DR_STEP (dra)))
3083 unsigned HOST_WIDE_INT step
3084 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3085 if (step != 0
3086 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3087 break;
3091 if (dump_enabled_p ())
3092 dump_printf_loc (MSG_NOTE, vect_location,
3093 DR_IS_READ (dra)
3094 ? "Detected interleaving load %T and %T\n"
3095 : "Detected interleaving store %T and %T\n",
3096 DR_REF (dra), DR_REF (drb));
3098 /* Link the found element into the group list. */
3099 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3101 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3102 lastinfo = stmtinfo_a;
3104 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3105 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3106 lastinfo = stmtinfo_b;
3108 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3109 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3111 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "Load suitable for SLP vectorization only.\n");
3115 if (init_b == init_prev
3116 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3117 && dump_enabled_p ())
3118 dump_printf_loc (MSG_NOTE, vect_location,
3119 "Queuing group with duplicate access for fixup\n");
3123 /* Fix up groups with duplicate entries by splitting them. */
3124 while (1)
3126 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3127 if (!(it != to_fixup.end ()))
3128 break;
3129 stmt_vec_info grp = *it;
3130 to_fixup.remove (grp);
3132 /* Find the earliest duplicate group member. */
3133 unsigned first_duplicate = -1u;
3134 stmt_vec_info next, g = grp;
3135 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3137 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3138 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3139 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3140 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3141 g = next;
3143 if (first_duplicate == -1U)
3144 continue;
3146 /* Then move all stmts after the first duplicate to a new group.
3147 Note this is a heuristic but one with the property that *it
3148 is fixed up completely. */
3149 g = grp;
3150 stmt_vec_info newgroup = NULL, ng = grp;
3151 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3153 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3155 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3156 if (!newgroup)
3157 newgroup = next;
3158 else
3159 DR_GROUP_NEXT_ELEMENT (ng) = next;
3160 ng = next;
3161 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3163 else
3164 g = DR_GROUP_NEXT_ELEMENT (g);
3166 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3168 /* Fixup the new group which still may contain duplicates. */
3169 to_fixup.add (newgroup);
3172 data_ref_pair *dr_pair;
3173 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3175 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3176 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3177 && !vect_analyze_data_ref_access (vinfo, dr_info))
3179 if (dump_enabled_p ())
3180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3181 "not vectorized: complicated access pattern.\n");
3183 if (is_a <bb_vec_info> (vinfo))
3185 /* Mark the statement as not vectorizable. */
3186 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3187 continue;
3189 else
3191 datarefs_copy.release ();
3192 return opt_result::failure_at (dr_info->stmt->stmt,
3193 "not vectorized:"
3194 " complicated access pattern.\n");
3199 datarefs_copy.release ();
3200 return opt_result::success ();
3203 /* Function vect_vfa_segment_size.
3205 Input:
3206 DR_INFO: The data reference.
3207 LENGTH_FACTOR: segment length to consider.
3209 Return a value suitable for the dr_with_seg_len::seg_len field.
3210 This is the "distance travelled" by the pointer from the first
3211 iteration in the segment to the last. Note that it does not include
3212 the size of the access; in effect it only describes the first byte. */
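/* For example (purely illustrative numbers): with DR_STEP = 4 bytes and
   LENGTH_FACTOR = 8 iterations, the segment length is (8 - 1) * 4 = 28
   bytes; the bytes of the access itself are accounted for separately by
   vect_vfa_access_size.  */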
3214 static tree
3215 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3217 length_factor = size_binop (MINUS_EXPR,
3218 fold_convert (sizetype, length_factor),
3219 size_one_node);
3220 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3221 length_factor);
3224 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3225 gives the worst-case number of bytes covered by the segment. */
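/* For example (hypothetical numbers): for an int reference (4 bytes) that
   leads a group with DR_GROUP_SIZE = 4 and DR_GROUP_GAP = 1, the access
   size is 4 * (4 - 1) = 12 bytes; if the access additionally uses
   dr_explicit_realign_optimized with a 16-byte vector type, another
   16 - 4 = 12 bytes are added.  */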
3227 static unsigned HOST_WIDE_INT
3228 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3230 stmt_vec_info stmt_vinfo = dr_info->stmt;
3231 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3232 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3233 unsigned HOST_WIDE_INT access_size = ref_size;
3234 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3236 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3237 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3239 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3240 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3241 == dr_explicit_realign_optimized))
3243 /* We might access a full vector's worth. */
3244 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3245 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3247 return access_size;
3250 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3251 describes. */
3253 static unsigned int
3254 vect_vfa_align (dr_vec_info *dr_info)
3256 return dr_alignment (dr_info->dr);
3259 /* Function vect_no_alias_p.
3261 Given data references A and B with equal base and offset, see whether
3262 the alias relation can be decided at compilation time. Return 1 if
3263 it can and the references alias, 0 if it can and the references do
3264 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3265 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3266 of dr_with_seg_len::{seg_len,access_size} for A and B. */
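/* A sketch with made-up values: for positive steps, DR_INITs 0 and 8,
   segment lengths of 28 bytes and access sizes of 4 bytes, the byte
   ranges [0, 32) and [8, 40) are known to overlap, so 1 is returned;
   had the second DR_INIT been 64, the ranges [0, 32) and [64, 96)
   could not overlap and 0 would be returned.  */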
3268 static int
3269 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3270 tree segment_length_a, tree segment_length_b,
3271 unsigned HOST_WIDE_INT access_size_a,
3272 unsigned HOST_WIDE_INT access_size_b)
3274 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3275 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3276 poly_uint64 const_length_a;
3277 poly_uint64 const_length_b;
3279 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3280 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3281 [a, a+12) */
3282 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3284 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3285 offset_a -= const_length_a;
3287 else
3288 const_length_a = tree_to_poly_uint64 (segment_length_a);
3289 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3291 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3292 offset_b -= const_length_b;
3294 else
3295 const_length_b = tree_to_poly_uint64 (segment_length_b);
3297 const_length_a += access_size_a;
3298 const_length_b += access_size_b;
3300 if (ranges_known_overlap_p (offset_a, const_length_a,
3301 offset_b, const_length_b))
3302 return 1;
3304 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3305 offset_b, const_length_b))
3306 return 0;
3308 return -1;
3311 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3312 in DDR is >= VF. */
3314 static bool
3315 dependence_distance_ge_vf (data_dependence_relation *ddr,
3316 unsigned int loop_depth, poly_uint64 vf)
3318 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3319 || DDR_NUM_DIST_VECTS (ddr) == 0)
3320 return false;
3322 /* If the dependence is exact, we should have limited the VF instead. */
3323 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3325 unsigned int i;
3326 lambda_vector dist_v;
3327 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3329 HOST_WIDE_INT dist = dist_v[loop_depth];
3330 if (dist != 0
3331 && !(dist > 0 && DDR_REVERSED_P (ddr))
3332 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3333 return false;
3336 if (dump_enabled_p ())
3337 dump_printf_loc (MSG_NOTE, vect_location,
3338 "dependence distance between %T and %T is >= VF\n",
3339 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3341 return true;
3344 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3346 static void
3347 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3349 dump_printf (dump_kind, "%s (%T) >= ",
3350 lower_bound.unsigned_p ? "unsigned" : "abs",
3351 lower_bound.expr);
3352 dump_dec (dump_kind, lower_bound.min_value);
3355 /* Record that the vectorized loop requires the vec_lower_bound described
3356 by EXPR, UNSIGNED_P and MIN_VALUE. */
3358 static void
3359 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3360 poly_uint64 min_value)
3362 vec<vec_lower_bound> &lower_bounds
3363 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3364 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3365 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3367 unsigned_p &= lower_bounds[i].unsigned_p;
3368 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3369 if (lower_bounds[i].unsigned_p != unsigned_p
3370 || maybe_lt (lower_bounds[i].min_value, min_value))
3372 lower_bounds[i].unsigned_p = unsigned_p;
3373 lower_bounds[i].min_value = min_value;
3374 if (dump_enabled_p ())
3376 dump_printf_loc (MSG_NOTE, vect_location,
3377 "updating run-time check to ");
3378 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3379 dump_printf (MSG_NOTE, "\n");
3382 return;
3385 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3386 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3389 dump_lower_bound (MSG_NOTE, lower_bound);
3390 dump_printf (MSG_NOTE, "\n");
3392 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3395 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3396 will span fewer than GAP bytes. */
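/* For instance (hypothetical values): with an estimated vectorization
   factor of 4, a group size of 2 and a scalar access size of 4 bytes,
   gaps of up to 4 * 2 * 4 = 32 bytes are considered small.  */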
3398 static bool
3399 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3400 poly_int64 gap)
3402 stmt_vec_info stmt_info = dr_info->stmt;
3403 HOST_WIDE_INT count
3404 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3405 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3406 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3407 return (estimated_poly_value (gap)
3408 <= count * vect_get_scalar_dr_size (dr_info));
3411 /* Return true if we know that there is no alias between DR_INFO_A and
3412 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3413 When returning true, set *LOWER_BOUND_OUT to this N. */
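/* For example (illustrative only): two 4-byte accesses to the same base
   at byte offsets 0 and 8 give N = 8 + 4 - 0 = 12, i.e. the accesses
   cannot alias once abs (DR_STEP) >= 12.  */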
3415 static bool
3416 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3417 poly_uint64 *lower_bound_out)
3419 /* Check that there is a constant gap of known sign between DR_A
3420 and DR_B. */
3421 data_reference *dr_a = dr_info_a->dr;
3422 data_reference *dr_b = dr_info_b->dr;
3423 poly_int64 init_a, init_b;
3424 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3425 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3426 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3427 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3428 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3429 || !ordered_p (init_a, init_b))
3430 return false;
3432 /* Sort DR_A and DR_B by the address they access. */
3433 if (maybe_lt (init_b, init_a))
3435 std::swap (init_a, init_b);
3436 std::swap (dr_info_a, dr_info_b);
3437 std::swap (dr_a, dr_b);
3440 /* If the two accesses could be dependent within a scalar iteration,
3441 make sure that we'd retain their order. */
3442 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3443 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3444 return false;
3446 /* There is no alias if abs (DR_STEP) is greater than or equal to
3447 the bytes spanned by the combination of the two accesses. */
3448 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3449 return true;
3452 /* Function vect_prune_runtime_alias_test_list.
3454 Prune a list of ddrs to be tested at run-time by versioning for alias.
3455 Merge several alias checks into one if possible.
3456 Return FALSE if the resulting list of ddrs is longer than allowed by
3457 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3459 opt_result
3460 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3462 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3463 hash_set <tree_pair_hash> compared_objects;
3465 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3466 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3467 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3468 const vec<vec_object_pair> &check_unequal_addrs
3469 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3470 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3471 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3473 ddr_p ddr;
3474 unsigned int i;
3475 tree length_factor;
3477 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3479 /* Step values are irrelevant for aliasing if the number of vector
3480 iterations is equal to the number of scalar iterations (which can
3481 happen for fully-SLP loops). */
3482 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3484 if (!vf_one_p)
3486 /* Convert the checks for nonzero steps into bound tests. */
3487 tree value;
3488 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3489 vect_check_lower_bound (loop_vinfo, value, true, 1);
3492 if (may_alias_ddrs.is_empty ())
3493 return opt_result::success ();
3495 comp_alias_ddrs.create (may_alias_ddrs.length ());
3497 unsigned int loop_depth
3498 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3499 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3501 /* First, we collect all data ref pairs for aliasing checks. */
3502 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3504 poly_uint64 lower_bound;
3505 tree segment_length_a, segment_length_b;
3506 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3507 unsigned int align_a, align_b;
3509 /* Ignore the alias if the VF we chose ended up being no greater
3510 than the dependence distance. */
3511 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3512 continue;
3514 if (DDR_OBJECT_A (ddr))
3516 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3517 if (!compared_objects.add (new_pair))
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "checking that %T and %T"
3522 " have different addresses\n",
3523 new_pair.first, new_pair.second);
3524 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3526 continue;
3529 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3530 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3532 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3533 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3535 bool preserves_scalar_order_p
3536 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3537 bool ignore_step_p
3538 = (vf_one_p
3539 && (preserves_scalar_order_p
3540 || operand_equal_p (DR_STEP (dr_info_a->dr),
3541 DR_STEP (dr_info_b->dr))));
3543 /* Skip the pair if inter-iteration dependencies are irrelevant
3544 and intra-iteration dependencies are guaranteed to be honored. */
3545 if (ignore_step_p
3546 && (preserves_scalar_order_p
3547 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3548 &lower_bound)))
3550 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "no need for alias check between "
3553 "%T and %T when VF is 1\n",
3554 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3555 continue;
3558 /* See whether we can handle the alias using a bounds check on
3559 the step, and whether that's likely to be the best approach.
3560 (It might not be, for example, if the minimum step is much larger
3561 than the number of bytes handled by one vector iteration.) */
3562 if (!ignore_step_p
3563 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3564 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3565 &lower_bound)
3566 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3567 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3569 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3570 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3573 "%T and %T when the step %T is outside ",
3574 DR_REF (dr_info_a->dr),
3575 DR_REF (dr_info_b->dr),
3576 DR_STEP (dr_info_a->dr));
3577 if (unsigned_p)
3578 dump_printf (MSG_NOTE, "[0");
3579 else
3581 dump_printf (MSG_NOTE, "(");
3582 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3584 dump_printf (MSG_NOTE, ", ");
3585 dump_dec (MSG_NOTE, lower_bound);
3586 dump_printf (MSG_NOTE, ")\n");
3588 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3589 unsigned_p, lower_bound);
3590 continue;
3593 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3594 if (dr_group_first_a)
3596 stmt_info_a = dr_group_first_a;
3597 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3600 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3601 if (dr_group_first_b)
3603 stmt_info_b = dr_group_first_b;
3604 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3607 if (ignore_step_p)
3609 segment_length_a = size_zero_node;
3610 segment_length_b = size_zero_node;
3612 else
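/* Choose how many scalar iterations' worth of accesses each segment
should cover: the vectorization factor is enough when both references
have the same step, otherwise be conservative and use the full scalar
iteration count. */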
3614 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3615 DR_STEP (dr_info_b->dr), 0))
3616 length_factor = scalar_loop_iters;
3617 else
3618 length_factor = size_int (vect_factor);
3619 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3620 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3622 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3623 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3624 align_a = vect_vfa_align (dr_info_a);
3625 align_b = vect_vfa_align (dr_info_b);
3627 /* See whether the alias is known at compilation time. */
3628 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3629 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3630 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3631 DR_OFFSET (dr_info_b->dr), 0)
3632 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3633 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3634 && poly_int_tree_p (segment_length_a)
3635 && poly_int_tree_p (segment_length_b))
3637 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3638 segment_length_a,
3639 segment_length_b,
3640 access_size_a,
3641 access_size_b);
3642 if (res >= 0 && dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "can tell at compile time that %T and %T",
3646 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3647 if (res == 0)
3648 dump_printf (MSG_NOTE, " do not alias\n");
3649 else
3650 dump_printf (MSG_NOTE, " alias\n");
3653 if (res == 0)
3654 continue;
3656 if (res == 1)
3657 return opt_result::failure_at (stmt_info_b->stmt,
3658 "not vectorized:"
3659 " compilation time alias: %G%G",
3660 stmt_info_a->stmt,
3661 stmt_info_b->stmt);
3664 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3665 access_size_a, align_a);
3666 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3667 access_size_b, align_b);
3668 /* Canonicalize the order to be the one that's needed for accurate
3669 RAW, WAR and WAW flags, in cases where the data references are
3670 well-ordered. The order doesn't really matter otherwise,
3671 but we might as well be consistent. */
3672 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3673 std::swap (dr_a, dr_b);
3675 dr_with_seg_len_pair_t dr_with_seg_len_pair
3676 (dr_a, dr_b, (preserves_scalar_order_p
3677 ? dr_with_seg_len_pair_t::WELL_ORDERED
3678 : dr_with_seg_len_pair_t::REORDERED));
3680 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3683 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3685 unsigned int count = (comp_alias_ddrs.length ()
3686 + check_unequal_addrs.length ());
3688 if (count && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
3689 return opt_result::failure_at
3690 (vect_location, "would need a runtime alias check\n");
3692 if (dump_enabled_p ())
3693 dump_printf_loc (MSG_NOTE, vect_location,
3694 "improved number of alias checks from %d to %d\n",
3695 may_alias_ddrs.length (), count);
3696 unsigned limit = param_vect_max_version_for_alias_checks;
3697 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3698 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3699 if (count > limit)
3700 return opt_result::failure_at
3701 (vect_location,
3702 "number of versioning for alias run-time tests exceeds %d "
3703 "(--param vect-max-version-for-alias-checks)\n", limit);
3705 return opt_result::success ();
3708 /* Check whether we can use an internal function for a gather load
3709 or scatter store. READ_P is true for loads and false for stores.
3710 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3711 the type of the memory elements being loaded or stored. OFFSET_TYPE
3712 is the type of the offset that is being applied to the invariant
3713 base address. SCALE is the amount by which the offset should
3714 be multiplied *after* it has been converted to address width.
3716 Return true if the function is supported, storing the function id in
3717 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3719 bool
3720 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3721 tree vectype, tree memory_type, tree offset_type,
3722 int scale, internal_fn *ifn_out,
3723 tree *offset_vectype_out)
3725 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3726 unsigned int element_bits = vector_element_bits (vectype);
3727 if (element_bits != memory_bits)
3728 /* For now the vector elements must be the same width as the
3729 memory elements. */
3730 return false;
3732 /* Work out which function we need. */
3733 internal_fn ifn, alt_ifn;
3734 if (read_p)
3736 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3737 alt_ifn = IFN_MASK_GATHER_LOAD;
3739 else
3741 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3742 alt_ifn = IFN_MASK_SCATTER_STORE;
3745 for (;;)
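/* Try successively wider offset types until the target supports the
gather/scatter (an unmasked operation may also fall back to the masked
variant), giving up once the offset is at least as wide as a pointer
and as the data elements. */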
3747 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3748 if (!offset_vectype)
3749 return false;
3751 /* Test whether the target supports this combination. */
3752 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3753 offset_vectype, scale))
3755 *ifn_out = ifn;
3756 *offset_vectype_out = offset_vectype;
3757 return true;
3759 else if (!masked_p
3760 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3761 memory_type,
3762 offset_vectype,
3763 scale))
3765 *ifn_out = alt_ifn;
3766 *offset_vectype_out = offset_vectype;
3767 return true;
3770 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3771 && TYPE_PRECISION (offset_type) >= element_bits)
3772 return false;
3774 offset_type = build_nonstandard_integer_type
3775 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3779 /* STMT_INFO is a call to an internal gather load or scatter store function.
3780 Describe the operation in INFO. */
3782 static void
3783 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3784 gather_scatter_info *info)
3786 gcall *call = as_a <gcall *> (stmt_info->stmt);
3787 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3788 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3790 info->ifn = gimple_call_internal_fn (call);
3791 info->decl = NULL_TREE;
3792 info->base = gimple_call_arg (call, 0);
3793 info->offset = gimple_call_arg (call, 1);
3794 info->offset_dt = vect_unknown_def_type;
3795 info->offset_vectype = NULL_TREE;
3796 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3797 info->element_type = TREE_TYPE (vectype);
3798 info->memory_type = TREE_TYPE (DR_REF (dr));
3801 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3802 gather load or scatter store. Describe the operation in *INFO if so. */
3804 bool
3805 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3806 gather_scatter_info *info)
3808 HOST_WIDE_INT scale = 1;
3809 poly_int64 pbitpos, pbitsize;
3810 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3811 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3812 tree offtype = NULL_TREE;
3813 tree decl = NULL_TREE, base, off;
3814 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3815 tree memory_type = TREE_TYPE (DR_REF (dr));
3816 machine_mode pmode;
3817 int punsignedp, reversep, pvolatilep = 0;
3818 internal_fn ifn;
3819 tree offset_vectype;
3820 bool masked_p = false;
3822 /* See whether this is already a call to a gather/scatter internal function.
3823 If not, see whether it's a masked load or store. */
3824 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3825 if (call && gimple_call_internal_p (call))
3827 ifn = gimple_call_internal_fn (call);
3828 if (internal_gather_scatter_fn_p (ifn))
3830 vect_describe_gather_scatter_call (stmt_info, info);
3831 return true;
3833 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3836 /* True if we should aim to use internal functions rather than
3837 built-in functions. */
3838 bool use_ifn_p = (DR_IS_READ (dr)
3839 ? supports_vec_gather_load_p ()
3840 : supports_vec_scatter_store_p ());
3842 base = DR_REF (dr);
3843 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3844 see if we can use the def stmt of the address. */
3845 if (masked_p
3846 && TREE_CODE (base) == MEM_REF
3847 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3848 && integer_zerop (TREE_OPERAND (base, 1))
3849 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3851 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3852 if (is_gimple_assign (def_stmt)
3853 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3854 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3857 /* The gather and scatter builtins need address of the form
3858 loop_invariant + vector * {1, 2, 4, 8}
3860 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3861 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3862 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3863 multiplications and additions in it. To get a vector, we need
3864 a single SSA_NAME that will be defined in the loop and will
3865 contain everything that is not loop invariant and that can be
3866 vectorized. The following code attempts to find such a preexisting
3867 SSA_NAME OFF and put the loop invariants into a tree BASE
3868 that can be gimplified before the loop. */
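/* For instance, for a gather like a[b[i]] with 4-byte elements this
typically results in BASE == &a, OFF being the SSA_NAME that holds
b[i] (after any widening conversion is stripped) and SCALE == 4. */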
3869 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3870 &punsignedp, &reversep, &pvolatilep);
3871 if (reversep)
3872 return false;
3874 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3876 if (TREE_CODE (base) == MEM_REF)
3878 if (!integer_zerop (TREE_OPERAND (base, 1)))
3880 if (off == NULL_TREE)
3881 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3882 else
3883 off = size_binop (PLUS_EXPR, off,
3884 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3886 base = TREE_OPERAND (base, 0);
3888 else
3889 base = build_fold_addr_expr (base);
3891 if (off == NULL_TREE)
3892 off = size_zero_node;
3894 /* If base is not loop invariant then, provided off is 0, we start with
3895 just the constant offset in the loop invariant BASE and continue with
3896 base as OFF; otherwise give up.
3897 We could handle that case by gimplifying the addition of base + off
3898 into some SSA_NAME and using that as off, but for now punt. */
3899 if (!expr_invariant_in_loop_p (loop, base))
3901 if (!integer_zerop (off))
3902 return false;
3903 off = base;
3904 base = size_int (pbytepos);
3906 /* Otherwise put base + constant offset into the loop invariant BASE
3907 and continue with OFF. */
3908 else
3910 base = fold_convert (sizetype, base);
3911 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3914 /* OFF at this point may be either a SSA_NAME or some tree expression
3915 from get_inner_reference. Try to peel off loop invariants from it
3916 into BASE as long as possible. */
3917 STRIP_NOPS (off);
3918 while (offtype == NULL_TREE)
3920 enum tree_code code;
3921 tree op0, op1, add = NULL_TREE;
3923 if (TREE_CODE (off) == SSA_NAME)
3925 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3927 if (expr_invariant_in_loop_p (loop, off))
3928 return false;
3930 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3931 break;
3933 op0 = gimple_assign_rhs1 (def_stmt);
3934 code = gimple_assign_rhs_code (def_stmt);
3935 op1 = gimple_assign_rhs2 (def_stmt);
3937 else
3939 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3940 return false;
3941 code = TREE_CODE (off);
3942 extract_ops_from_tree (off, &code, &op0, &op1);
3944 switch (code)
3946 case POINTER_PLUS_EXPR:
3947 case PLUS_EXPR:
3948 if (expr_invariant_in_loop_p (loop, op0))
3950 add = op0;
3951 off = op1;
3952 do_add:
3953 add = fold_convert (sizetype, add);
3954 if (scale != 1)
3955 add = size_binop (MULT_EXPR, add, size_int (scale));
3956 base = size_binop (PLUS_EXPR, base, add);
3957 continue;
3959 if (expr_invariant_in_loop_p (loop, op1))
3961 add = op1;
3962 off = op0;
3963 goto do_add;
3965 break;
3966 case MINUS_EXPR:
3967 if (expr_invariant_in_loop_p (loop, op1))
3969 add = fold_convert (sizetype, op1);
3970 add = size_binop (MINUS_EXPR, size_zero_node, add);
3971 off = op0;
3972 goto do_add;
3974 break;
3975 case MULT_EXPR:
3976 if (scale == 1 && tree_fits_shwi_p (op1))
3978 int new_scale = tree_to_shwi (op1);
3979 /* Only treat this as a scaling operation if the target
3980 supports it for at least some offset type. */
3981 if (use_ifn_p
3982 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3983 masked_p, vectype, memory_type,
3984 signed_char_type_node,
3985 new_scale, &ifn,
3986 &offset_vectype)
3987 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3988 masked_p, vectype, memory_type,
3989 unsigned_char_type_node,
3990 new_scale, &ifn,
3991 &offset_vectype))
3992 break;
3993 scale = new_scale;
3994 off = op0;
3995 continue;
3997 break;
3998 case SSA_NAME:
3999 off = op0;
4000 continue;
4001 CASE_CONVERT:
4002 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4003 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4004 break;
4006 /* Don't include the conversion if the target is happy with
4007 the current offset type. */
4008 if (use_ifn_p
4009 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4010 masked_p, vectype, memory_type,
4011 TREE_TYPE (off), scale, &ifn,
4012 &offset_vectype))
4013 break;
4015 if (TYPE_PRECISION (TREE_TYPE (op0))
4016 == TYPE_PRECISION (TREE_TYPE (off)))
4018 off = op0;
4019 continue;
4022 /* Include the conversion if it is widening and either we're using
4023 the IFN path, or the target can handle the converted-from offset
4024 type, or the current offset size is not already the same as the
4025 data vector element size. */
4026 if ((TYPE_PRECISION (TREE_TYPE (op0))
4027 < TYPE_PRECISION (TREE_TYPE (off)))
4028 && (use_ifn_p
4029 || (DR_IS_READ (dr)
4030 ? (targetm.vectorize.builtin_gather
4031 && targetm.vectorize.builtin_gather (vectype,
4032 TREE_TYPE (op0),
4033 scale))
4034 : (targetm.vectorize.builtin_scatter
4035 && targetm.vectorize.builtin_scatter (vectype,
4036 TREE_TYPE (op0),
4037 scale)))
4038 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4039 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4041 off = op0;
4042 offtype = TREE_TYPE (off);
4043 STRIP_NOPS (off);
4044 continue;
4046 break;
4047 default:
4048 break;
4050 break;
4053 /* If at the end OFF still isn't a SSA_NAME or isn't
4054 defined in the loop, punt. */
4055 if (TREE_CODE (off) != SSA_NAME
4056 || expr_invariant_in_loop_p (loop, off))
4057 return false;
4059 if (offtype == NULL_TREE)
4060 offtype = TREE_TYPE (off);
4062 if (use_ifn_p)
4064 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4065 vectype, memory_type, offtype, scale,
4066 &ifn, &offset_vectype))
4067 ifn = IFN_LAST;
4068 decl = NULL_TREE;
4070 else
4072 if (DR_IS_READ (dr))
4074 if (targetm.vectorize.builtin_gather)
4075 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4077 else
4079 if (targetm.vectorize.builtin_scatter)
4080 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4082 ifn = IFN_LAST;
4083 /* The offset vector type will be read from DECL when needed. */
4084 offset_vectype = NULL_TREE;
4087 info->ifn = ifn;
4088 info->decl = decl;
4089 info->base = base;
4090 info->offset = off;
4091 info->offset_dt = vect_unknown_def_type;
4092 info->offset_vectype = offset_vectype;
4093 info->scale = scale;
4094 info->element_type = TREE_TYPE (vectype);
4095 info->memory_type = memory_type;
4096 return true;
4099 /* Find the data references in STMT, analyze them with respect to LOOP and
4100 append them to DATAREFS. Return false if datarefs in this stmt cannot
4101 be handled. */
4103 opt_result
4104 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4105 vec<data_reference_p> *datarefs,
4106 vec<int> *dataref_groups, int group_id)
4108 /* We can ignore clobbers for dataref analysis - they are removed during
4109 loop vectorization and BB vectorization checks dependences with a
4110 stmt walk. */
4111 if (gimple_clobber_p (stmt))
4112 return opt_result::success ();
4114 if (gimple_has_volatile_ops (stmt))
4115 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4116 stmt);
4118 if (stmt_can_throw_internal (cfun, stmt))
4119 return opt_result::failure_at (stmt,
4120 "not vectorized:"
4121 " statement can throw an exception: %G",
4122 stmt);
4124 auto_vec<data_reference_p, 2> refs;
4125 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4126 if (!res)
4127 return res;
4129 if (refs.is_empty ())
4130 return opt_result::success ();
4132 if (refs.length () > 1)
4134 while (!refs.is_empty ())
4135 free_data_ref (refs.pop ());
4136 return opt_result::failure_at (stmt,
4137 "not vectorized: more than one "
4138 "data ref in stmt: %G", stmt);
4141 data_reference_p dr = refs.pop ();
4142 if (gcall *call = dyn_cast <gcall *> (stmt))
4143 if (!gimple_call_internal_p (call)
4144 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4145 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4147 free_data_ref (dr);
4148 return opt_result::failure_at (stmt,
4149 "not vectorized: dr in a call %G", stmt);
4152 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4153 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4155 free_data_ref (dr);
4156 return opt_result::failure_at (stmt,
4157 "not vectorized:"
4158 " statement is bitfield access %G", stmt);
4161 if (DR_BASE_ADDRESS (dr)
4162 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4164 free_data_ref (dr);
4165 return opt_result::failure_at (stmt,
4166 "not vectorized:"
4167 " base addr of dr is a constant\n");
4170 /* Check whether this may be a SIMD lane access and adjust the
4171 DR to make it easier for us to handle it. */
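/* For example, a reference such as array[GOMP_SIMD_LANE (simduid, ...)]
to the per-lane array of an "omp simd" loop is rebuilt below with the
array base, a zero offset and a step of one element, and is then
marked as a simd-lane access. */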
4172 if (loop
4173 && loop->simduid
4174 && (!DR_BASE_ADDRESS (dr)
4175 || !DR_OFFSET (dr)
4176 || !DR_INIT (dr)
4177 || !DR_STEP (dr)))
4179 struct data_reference *newdr
4180 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4181 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4182 if (DR_BASE_ADDRESS (newdr)
4183 && DR_OFFSET (newdr)
4184 && DR_INIT (newdr)
4185 && DR_STEP (newdr)
4186 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4187 && integer_zerop (DR_STEP (newdr)))
4189 tree base_address = DR_BASE_ADDRESS (newdr);
4190 tree off = DR_OFFSET (newdr);
4191 tree step = ssize_int (1);
4192 if (integer_zerop (off)
4193 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4195 off = TREE_OPERAND (base_address, 1);
4196 base_address = TREE_OPERAND (base_address, 0);
4198 STRIP_NOPS (off);
4199 if (TREE_CODE (off) == MULT_EXPR
4200 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4202 step = TREE_OPERAND (off, 1);
4203 off = TREE_OPERAND (off, 0);
4204 STRIP_NOPS (off);
4206 if (CONVERT_EXPR_P (off)
4207 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4208 < TYPE_PRECISION (TREE_TYPE (off))))
4209 off = TREE_OPERAND (off, 0);
4210 if (TREE_CODE (off) == SSA_NAME)
4212 gimple *def = SSA_NAME_DEF_STMT (off);
4213 /* Look through widening conversion. */
4214 if (is_gimple_assign (def)
4215 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4217 tree rhs1 = gimple_assign_rhs1 (def);
4218 if (TREE_CODE (rhs1) == SSA_NAME
4219 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4220 && (TYPE_PRECISION (TREE_TYPE (off))
4221 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4222 def = SSA_NAME_DEF_STMT (rhs1);
4224 if (is_gimple_call (def)
4225 && gimple_call_internal_p (def)
4226 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4228 tree arg = gimple_call_arg (def, 0);
4229 tree reft = TREE_TYPE (DR_REF (newdr));
4230 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4231 arg = SSA_NAME_VAR (arg);
4232 if (arg == loop->simduid
4233 /* For now. */
4234 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4236 DR_BASE_ADDRESS (newdr) = base_address;
4237 DR_OFFSET (newdr) = ssize_int (0);
4238 DR_STEP (newdr) = step;
4239 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4240 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4241 /* Mark as simd-lane access. */
4242 tree arg2 = gimple_call_arg (def, 1);
4243 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4244 free_data_ref (dr);
4245 datarefs->safe_push (newdr);
4246 if (dataref_groups)
4247 dataref_groups->safe_push (group_id);
4248 return opt_result::success ();
4253 free_data_ref (newdr);
4256 datarefs->safe_push (dr);
4257 if (dataref_groups)
4258 dataref_groups->safe_push (group_id);
4259 return opt_result::success ();
4262 /* Function vect_analyze_data_refs.
4264 Find all the data references in the loop or basic block.
4266 The general structure of the analysis of data refs in the vectorizer is as
4267 follows:
4268 1- vect_analyze_data_refs(loop/bb): call
4269 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4270 in the loop/bb and their dependences.
4271 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4272 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4273 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4277 opt_result
4278 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4280 class loop *loop = NULL;
4281 unsigned int i;
4282 struct data_reference *dr;
4283 tree scalar_type;
4285 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4287 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4288 loop = LOOP_VINFO_LOOP (loop_vinfo);
4290 /* Go through the data-refs, check that the analysis succeeded. Update
4291 pointer from stmt_vec_info struct to DR and vectype. */
4293 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4294 FOR_EACH_VEC_ELT (datarefs, i, dr)
4296 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4297 poly_uint64 vf;
4299 gcc_assert (DR_REF (dr));
4300 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4301 gcc_assert (!stmt_info->dr_aux.dr);
4302 stmt_info->dr_aux.dr = dr;
4303 stmt_info->dr_aux.stmt = stmt_info;
4305 /* Check that analysis of the data-ref succeeded. */
4306 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4307 || !DR_STEP (dr))
4309 bool maybe_gather
4310 = DR_IS_READ (dr)
4311 && !TREE_THIS_VOLATILE (DR_REF (dr));
4312 bool maybe_scatter
4313 = DR_IS_WRITE (dr)
4314 && !TREE_THIS_VOLATILE (DR_REF (dr))
4315 && (targetm.vectorize.builtin_scatter != NULL
4316 || supports_vec_scatter_store_p ());
4318 /* If target supports vector gather loads or scatter stores,
4319 see if they can be used. */
4320 if (is_a <loop_vec_info> (vinfo)
4321 && !nested_in_vect_loop_p (loop, stmt_info))
4323 if (maybe_gather || maybe_scatter)
4325 if (maybe_gather)
4326 gatherscatter = GATHER;
4327 else
4328 gatherscatter = SCATTER;
4332 if (gatherscatter == SG_NONE)
4334 if (dump_enabled_p ())
4335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4336 "not vectorized: data ref analysis "
4337 "failed %G", stmt_info->stmt);
4338 if (is_a <bb_vec_info> (vinfo))
4340 /* In BB vectorization the ref can still participate
4341 in dependence analysis, we just can't vectorize it. */
4342 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4343 continue;
4345 return opt_result::failure_at (stmt_info->stmt,
4346 "not vectorized:"
4347 " data ref analysis failed: %G",
4348 stmt_info->stmt);
4352 /* See if this was detected as SIMD lane access. */
4353 if (dr->aux == (void *)-1
4354 || dr->aux == (void *)-2
4355 || dr->aux == (void *)-3
4356 || dr->aux == (void *)-4)
4358 if (nested_in_vect_loop_p (loop, stmt_info))
4359 return opt_result::failure_at (stmt_info->stmt,
4360 "not vectorized:"
4361 " data ref analysis failed: %G",
4362 stmt_info->stmt);
4363 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4364 = -(uintptr_t) dr->aux;
4367 tree base = get_base_address (DR_REF (dr));
4368 if (base && VAR_P (base) && DECL_NONALIASED (base))
4370 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4372 "not vectorized: base object not addressable "
4373 "for stmt: %G", stmt_info->stmt);
4374 if (is_a <bb_vec_info> (vinfo))
4376 /* In BB vectorization the ref can still participate
4377 in dependence analysis, we just can't vectorize it. */
4378 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4379 continue;
4381 return opt_result::failure_at (stmt_info->stmt,
4382 "not vectorized: base object not"
4383 " addressable for stmt: %G",
4384 stmt_info->stmt);
4387 if (is_a <loop_vec_info> (vinfo)
4388 && DR_STEP (dr)
4389 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4391 if (nested_in_vect_loop_p (loop, stmt_info))
4392 return opt_result::failure_at (stmt_info->stmt,
4393 "not vectorized: "
4394 "not suitable for strided load %G",
4395 stmt_info->stmt);
4396 STMT_VINFO_STRIDED_P (stmt_info) = true;
4399 /* Update DR field in stmt_vec_info struct. */
4401 /* If the dataref is in an inner-loop of the loop that is considered for
4402 vectorization, we also want to analyze the access relative to
4403 the outer-loop (DR contains information only relative to the
4404 inner-most enclosing loop). We do that by building a reference to the
4405 first location accessed by the inner-loop, and analyze it relative to
4406 the outer-loop. */
4407 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4409 /* Build a reference to the first location accessed by the
4410 inner loop: *(BASE + INIT + OFFSET). By construction,
4411 this address must be invariant in the inner loop, so we
4412 can consider it as being used in the outer loop. */
4413 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4414 tree offset = unshare_expr (DR_OFFSET (dr));
4415 tree init = unshare_expr (DR_INIT (dr));
4416 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4417 init, offset);
4418 tree init_addr = fold_build_pointer_plus (base, init_offset);
4419 tree init_ref = build_fold_indirect_ref (init_addr);
4421 if (dump_enabled_p ())
4422 dump_printf_loc (MSG_NOTE, vect_location,
4423 "analyze in outer loop: %T\n", init_ref);
4425 opt_result res
4426 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4427 init_ref, loop, stmt_info->stmt);
4428 if (!res)
4429 /* dr_analyze_innermost already explained the failure. */
4430 return res;
4432 if (dump_enabled_p ())
4433 dump_printf_loc (MSG_NOTE, vect_location,
4434 "\touter base_address: %T\n"
4435 "\touter offset from base address: %T\n"
4436 "\touter constant offset from base address: %T\n"
4437 "\touter step: %T\n"
4438 "\touter base alignment: %d\n\n"
4439 "\touter base misalignment: %d\n"
4440 "\touter offset alignment: %d\n"
4441 "\touter step alignment: %d\n",
4442 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4443 STMT_VINFO_DR_OFFSET (stmt_info),
4444 STMT_VINFO_DR_INIT (stmt_info),
4445 STMT_VINFO_DR_STEP (stmt_info),
4446 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4447 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4448 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4449 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4452 /* Set vectype for STMT. */
4453 scalar_type = TREE_TYPE (DR_REF (dr));
4454 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4455 if (!vectype)
4457 if (dump_enabled_p ())
4459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4460 "not vectorized: no vectype for stmt: %G",
4461 stmt_info->stmt);
4462 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4463 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4464 scalar_type);
4465 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4468 if (is_a <bb_vec_info> (vinfo))
4470 /* No vector type is fine, the ref can still participate
4471 in dependence analysis, we just can't vectorize it. */
4472 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4473 continue;
4475 if (fatal)
4476 *fatal = false;
4477 return opt_result::failure_at (stmt_info->stmt,
4478 "not vectorized:"
4479 " no vectype for stmt: %G"
4480 " scalar_type: %T\n",
4481 stmt_info->stmt, scalar_type);
4483 else
4485 if (dump_enabled_p ())
4486 dump_printf_loc (MSG_NOTE, vect_location,
4487 "got vectype for stmt: %G%T\n",
4488 stmt_info->stmt, vectype);
4491 /* Adjust the minimal vectorization factor according to the
4492 vector type. */
4493 vf = TYPE_VECTOR_SUBPARTS (vectype);
4494 *min_vf = upper_bound (*min_vf, vf);
4496 /* Leave the BB vectorizer to pick the vector type later, based on
4497 the final dataref group size and SLP node size. */
4498 if (is_a <loop_vec_info> (vinfo))
4499 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4501 if (gatherscatter != SG_NONE)
4503 gather_scatter_info gs_info;
4504 if (!vect_check_gather_scatter (stmt_info,
4505 as_a <loop_vec_info> (vinfo),
4506 &gs_info)
4507 || !get_vectype_for_scalar_type (vinfo,
4508 TREE_TYPE (gs_info.offset)))
4510 if (fatal)
4511 *fatal = false;
4512 return opt_result::failure_at
4513 (stmt_info->stmt,
4514 (gatherscatter == GATHER)
4515 ? "not vectorized: not suitable for gather load %G"
4516 : "not vectorized: not suitable for scatter store %G",
4517 stmt_info->stmt);
4519 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4523 /* We used to stop processing and prune the list here. Verify we no
4524 longer need to. */
4525 gcc_assert (i == datarefs.length ());
4527 return opt_result::success ();
4531 /* Function vect_get_new_vect_var.
4533 Returns a name for a new variable. The current naming scheme uses the
4534 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4535 VAR_KIND), and appends "_" followed by NAME to the prefix if NAME is
4536 provided. */
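/* For example, with VAR_KIND == vect_pointer_var and NAME == "x" the
new variable is named "vectp_x". */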
4538 tree
4539 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4541 const char *prefix;
4542 tree new_vect_var;
4544 switch (var_kind)
4546 case vect_simple_var:
4547 prefix = "vect";
4548 break;
4549 case vect_scalar_var:
4550 prefix = "stmp";
4551 break;
4552 case vect_mask_var:
4553 prefix = "mask";
4554 break;
4555 case vect_pointer_var:
4556 prefix = "vectp";
4557 break;
4558 default:
4559 gcc_unreachable ();
4562 if (name)
4564 char* tmp = concat (prefix, "_", name, NULL);
4565 new_vect_var = create_tmp_reg (type, tmp);
4566 free (tmp);
4568 else
4569 new_vect_var = create_tmp_reg (type, prefix);
4571 return new_vect_var;
4574 /* Like vect_get_new_vect_var but return an SSA name. */
4576 tree
4577 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4579 const char *prefix;
4580 tree new_vect_var;
4582 switch (var_kind)
4584 case vect_simple_var:
4585 prefix = "vect";
4586 break;
4587 case vect_scalar_var:
4588 prefix = "stmp";
4589 break;
4590 case vect_pointer_var:
4591 prefix = "vectp";
4592 break;
4593 default:
4594 gcc_unreachable ();
4597 if (name)
4599 char* tmp = concat (prefix, "_", name, NULL);
4600 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4601 free (tmp);
4603 else
4604 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4606 return new_vect_var;
4609 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4611 static void
4612 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4614 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4615 int misalign = DR_MISALIGNMENT (dr_info);
4616 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4617 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4618 else
4619 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4620 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4621 misalign);
4624 /* Function vect_create_addr_base_for_vector_ref.
4626 Create an expression that computes the address of the first memory location
4627 that will be accessed for a data reference.
4629 Input:
4630 STMT_INFO: The statement containing the data reference.
4631 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4632 OFFSET: Optional. If supplied, it is added to the initial address.
4633 LOOP: Specify relative to which loop-nest the address should be computed.
4634 For example, when the dataref is in an inner-loop nested in an
4635 outer-loop that is now being vectorized, LOOP can be either the
4636 outer-loop, or the inner-loop. The first memory location accessed
4637 by the following dataref ('in' points to short):
4639 for (i=0; i<N; i++)
4640 for (j=0; j<M; j++)
4641 s += in[i+j]
4643 is as follows:
4644 if LOOP=i_loop: &in (relative to i_loop)
4645 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4646 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4647 initial address. Unlike OFFSET, which is a number of elements to
4648 be added, BYTE_OFFSET is measured in bytes.
4650 Output:
4651 1. Return an SSA_NAME whose value is the address of the memory location of
4652 the first vector of the data reference.
4653 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4654 these statement(s) which define the returned SSA_NAME.
4656 FORNOW: We are only handling array accesses with step 1. */
4658 tree
4659 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4660 gimple_seq *new_stmt_list,
4661 tree offset,
4662 tree byte_offset)
4664 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4665 struct data_reference *dr = dr_info->dr;
4666 const char *base_name;
4667 tree addr_base;
4668 tree dest;
4669 gimple_seq seq = NULL;
4670 tree vect_ptr_type;
4671 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4672 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4673 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4675 tree data_ref_base = unshare_expr (drb->base_address);
4676 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4677 tree init = unshare_expr (drb->init);
4679 if (loop_vinfo)
4680 base_name = get_name (data_ref_base);
4681 else
4683 base_offset = ssize_int (0);
4684 init = ssize_int (0);
4685 base_name = get_name (DR_REF (dr));
4688 /* Create base_offset */
4689 base_offset = size_binop (PLUS_EXPR,
4690 fold_convert (sizetype, base_offset),
4691 fold_convert (sizetype, init));
4693 if (offset)
4695 offset = fold_build2 (MULT_EXPR, sizetype,
4696 fold_convert (sizetype, offset), step);
4697 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4698 base_offset, offset);
4700 if (byte_offset)
4702 byte_offset = fold_convert (sizetype, byte_offset);
4703 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4704 base_offset, byte_offset);
4707 /* base + base_offset */
4708 if (loop_vinfo)
4709 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4710 else
4712 addr_base = build1 (ADDR_EXPR,
4713 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4714 unshare_expr (DR_REF (dr)));
4717 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4718 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4719 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4720 gimple_seq_add_seq (new_stmt_list, seq);
4722 if (DR_PTR_INFO (dr)
4723 && TREE_CODE (addr_base) == SSA_NAME
4724 && !SSA_NAME_PTR_INFO (addr_base))
4726 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4727 if (offset || byte_offset)
4728 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4731 if (dump_enabled_p ())
4732 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4734 return addr_base;
4738 /* Function vect_create_data_ref_ptr.
4740 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4741 location accessed in the loop by STMT_INFO, along with the def-use update
4742 chain to appropriately advance the pointer through the loop iterations.
4743 Also set aliasing information for the pointer. This pointer is used by
4744 the callers to this function to create a memory reference expression for
4745 vector load/store access.
4747 Input:
4748 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4749 GIMPLE_ASSIGN <name, data-ref> or
4750 GIMPLE_ASSIGN <data-ref, name>.
4751 2. AGGR_TYPE: the type of the reference, which should be either a vector
4752 or an array.
4753 3. AT_LOOP: the loop where the vector memref is to be created.
4754 4. OFFSET (optional): an offset to be added to the initial address accessed
4755 by the data-ref in STMT_INFO.
4756 5. BSI: location where the new stmts are to be placed if there is no loop
4757 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4758 pointing to the initial address.
4759 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4760 to the initial address accessed by the data-ref in STMT_INFO. This is
4761 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4762 in bytes.
4763 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4764 to the IV during each iteration of the loop. NULL says to move
4765 by one copy of AGGR_TYPE up or down, depending on the step of the
4766 data reference.
4768 Output:
4769 1. Declare a new ptr to vector_type, and have it point to the base of the
4770 data reference (initial address accessed by the data reference).
4771 For example, for vector of type V8HI, the following code is generated:
4773 v8hi *ap;
4774 ap = (v8hi *)initial_address;
4776 if OFFSET is not supplied:
4777 initial_address = &a[init];
4778 if OFFSET is supplied:
4779 initial_address = &a[init + OFFSET];
4780 if BYTE_OFFSET is supplied:
4781 initial_address = &a[init] + BYTE_OFFSET;
4783 Return the initial_address in INITIAL_ADDRESS.
4785 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4786 update the pointer in each iteration of the loop.
4788 Return the increment stmt that updates the pointer in PTR_INCR.
4790 3. Return the pointer. */
4792 tree
4793 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4794 tree aggr_type, class loop *at_loop, tree offset,
4795 tree *initial_address, gimple_stmt_iterator *gsi,
4796 gimple **ptr_incr, bool only_init,
4797 tree byte_offset, tree iv_step)
4799 const char *base_name;
4800 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4801 class loop *loop = NULL;
4802 bool nested_in_vect_loop = false;
4803 class loop *containing_loop = NULL;
4804 tree aggr_ptr_type;
4805 tree aggr_ptr;
4806 tree new_temp;
4807 gimple_seq new_stmt_list = NULL;
4808 edge pe = NULL;
4809 basic_block new_bb;
4810 tree aggr_ptr_init;
4811 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4812 struct data_reference *dr = dr_info->dr;
4813 tree aptr;
4814 gimple_stmt_iterator incr_gsi;
4815 bool insert_after;
4816 tree indx_before_incr, indx_after_incr;
4817 gimple *incr;
4818 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4820 gcc_assert (iv_step != NULL_TREE
4821 || TREE_CODE (aggr_type) == ARRAY_TYPE
4822 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4824 if (loop_vinfo)
4826 loop = LOOP_VINFO_LOOP (loop_vinfo);
4827 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4828 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4829 pe = loop_preheader_edge (loop);
4831 else
4833 gcc_assert (bb_vinfo);
4834 only_init = true;
4835 *ptr_incr = NULL;
4838 /* Create an expression for the first address accessed by this load
4839 in LOOP. */
4840 base_name = get_name (DR_BASE_ADDRESS (dr));
4842 if (dump_enabled_p ())
4844 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4845 dump_printf_loc (MSG_NOTE, vect_location,
4846 "create %s-pointer variable to type: %T",
4847 get_tree_code_name (TREE_CODE (aggr_type)),
4848 aggr_type);
4849 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4850 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4851 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4852 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4853 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4854 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4855 else
4856 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4857 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4860 /* (1) Create the new aggregate-pointer variable.
4861 Vector and array types inherit the alias set of their component
4862 type by default so we need to use a ref-all pointer if the data
4863 reference does not conflict with the created aggregated data
4864 reference because it is not addressable. */
4865 bool need_ref_all = false;
4866 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4867 get_alias_set (DR_REF (dr))))
4868 need_ref_all = true;
4869 /* Likewise for any of the data references in the stmt group. */
4870 else if (DR_GROUP_SIZE (stmt_info) > 1)
4872 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4875 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4876 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4877 get_alias_set (DR_REF (sdr))))
4879 need_ref_all = true;
4880 break;
4882 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4884 while (sinfo);
4886 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4887 need_ref_all);
4888 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4891 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4892 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4893 def-use update cycles for the pointer: one relative to the outer-loop
4894 (LOOP), which is what steps (3) and (4) below do. The other is relative
4895 to the inner-loop (which is the inner-most loop containing the dataref),
4896 and this is done by step (5) below.
4898 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4899 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4900 redundant. Steps (3),(4) create the following:
4902 vp0 = &base_addr;
4903 LOOP: vp1 = phi(vp0,vp2)
4906 vp2 = vp1 + step
4907 goto LOOP
4909 If there is an inner-loop nested in loop, then step (5) will also be
4910 applied, and an additional update in the inner-loop will be created:
4912 vp0 = &base_addr;
4913 LOOP: vp1 = phi(vp0,vp2)
4915 inner: vp3 = phi(vp1,vp4)
4916 vp4 = vp3 + inner_step
4917 if () goto inner
4919 vp2 = vp1 + step
4920 if () goto LOOP */
4922 /* (2) Calculate the initial address of the aggregate-pointer, and set
4923 the aggregate-pointer to point to it before the loop. */
4925 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4927 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4928 stmt_info, &new_stmt_list,
4929 offset, byte_offset);
4930 if (new_stmt_list)
4932 if (pe)
4934 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4935 gcc_assert (!new_bb);
4937 else
4938 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4941 *initial_address = new_temp;
4942 aggr_ptr_init = new_temp;
4944 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4945 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4946 inner-loop nested in LOOP (during outer-loop vectorization). */
4948 /* No update in loop is required. */
4949 if (only_init && (!loop_vinfo || at_loop == loop))
4950 aptr = aggr_ptr_init;
4951 else
4953 /* Accesses to invariant addresses should be handled specially
4954 by the caller. */
4955 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4956 gcc_assert (!integer_zerop (step));
4958 if (iv_step == NULL_TREE)
4960 /* The step of the aggregate pointer is the type size,
4961 negated for downward accesses. */
4962 iv_step = TYPE_SIZE_UNIT (aggr_type);
4963 if (tree_int_cst_sgn (step) == -1)
4964 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4967 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4969 create_iv (aggr_ptr_init,
4970 fold_convert (aggr_ptr_type, iv_step),
4971 aggr_ptr, loop, &incr_gsi, insert_after,
4972 &indx_before_incr, &indx_after_incr);
4973 incr = gsi_stmt (incr_gsi);
4975 /* Copy the points-to information if it exists. */
4976 if (DR_PTR_INFO (dr))
4978 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4979 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4981 if (ptr_incr)
4982 *ptr_incr = incr;
4984 aptr = indx_before_incr;
4987 if (!nested_in_vect_loop || only_init)
4988 return aptr;
4991 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4992 nested in LOOP, if exists. */
4994 gcc_assert (nested_in_vect_loop);
4995 if (!only_init)
4997 standard_iv_increment_position (containing_loop, &incr_gsi,
4998 &insert_after);
4999 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5000 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5001 &indx_after_incr);
5002 incr = gsi_stmt (incr_gsi);
5004 /* Copy the points-to information if it exists. */
5005 if (DR_PTR_INFO (dr))
5007 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5008 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5010 if (ptr_incr)
5011 *ptr_incr = incr;
5013 return indx_before_incr;
5015 else
5016 gcc_unreachable ();
5020 /* Function bump_vector_ptr
5022 Increment a pointer (to a vector type) by vector-size. If requested,
5023 i.e. if PTR-INCR is given, then also connect the new increment stmt
5024 to the existing def-use update-chain of the pointer, by modifying
5025 the PTR_INCR as illustrated below:
5027 The pointer def-use update-chain before this function:
5028 DATAREF_PTR = phi (p_0, p_2)
5029 ....
5030 PTR_INCR: p_2 = DATAREF_PTR + step
5032 The pointer def-use update-chain after this function:
5033 DATAREF_PTR = phi (p_0, p_2)
5034 ....
5035 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5036 ....
5037 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5039 Input:
5040 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5041 in the loop.
5042 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5043 the loop. The increment amount across iterations is expected
5044 to be vector_size.
5045 BSI - location where the new update stmt is to be placed.
5046 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5047 BUMP - optional. The offset by which to bump the pointer. If not given,
5048 the offset is assumed to be vector_size.
5050 Output: Return NEW_DATAREF_PTR as illustrated above.
5054 tree
5055 bump_vector_ptr (vec_info *vinfo,
5056 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5057 stmt_vec_info stmt_info, tree bump)
5059 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5060 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5061 tree update = TYPE_SIZE_UNIT (vectype);
5062 gimple *incr_stmt;
5063 ssa_op_iter iter;
5064 use_operand_p use_p;
5065 tree new_dataref_ptr;
5067 if (bump)
5068 update = bump;
5070 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5071 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5072 else
5073 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5074 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5075 dataref_ptr, update);
5076 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5077 /* Fold the increment, avoiding excessive use-def chains that would
5078 otherwise lead to compile-time issues for passes until the next
5079 forwprop pass, which would do this as well. */
5080 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5081 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5083 incr_stmt = gsi_stmt (fold_gsi);
5084 update_stmt (incr_stmt);
5087 /* Copy the points-to information if it exists. */
5088 if (DR_PTR_INFO (dr))
5090 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5091 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5094 if (!ptr_incr)
5095 return new_dataref_ptr;
5097 /* Update the vector-pointer's cross-iteration increment. */
5098 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5100 tree use = USE_FROM_PTR (use_p);
5102 if (use == dataref_ptr)
5103 SET_USE (use_p, new_dataref_ptr);
5104 else
5105 gcc_assert (operand_equal_p (use, update, 0));
5108 return new_dataref_ptr;
5112 /* Copy memory reference info such as base/clique from the SRC reference
5113 to the DEST MEM_REF. */
5115 void
5116 vect_copy_ref_info (tree dest, tree src)
5118 if (TREE_CODE (dest) != MEM_REF)
5119 return;
5121 tree src_base = src;
5122 while (handled_component_p (src_base))
5123 src_base = TREE_OPERAND (src_base, 0);
5124 if (TREE_CODE (src_base) != MEM_REF
5125 && TREE_CODE (src_base) != TARGET_MEM_REF)
5126 return;
5128 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5129 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5133 /* Function vect_create_destination_var.
5135 Create a new temporary of type VECTYPE. */
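/* For example, for a scalar SSA name with base variable "sum" and
version 5, this creates a vector destination named "vect_sum_5"
(or "stmp_sum_5" when VECTYPE is NULL). */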
5137 tree
5138 vect_create_destination_var (tree scalar_dest, tree vectype)
5140 tree vec_dest;
5141 const char *name;
5142 char *new_name;
5143 tree type;
5144 enum vect_var_kind kind;
5146 kind = vectype
5147 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5148 ? vect_mask_var
5149 : vect_simple_var
5150 : vect_scalar_var;
5151 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5153 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5155 name = get_name (scalar_dest);
5156 if (name)
5157 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5158 else
5159 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5160 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5161 free (new_name);
5163 return vec_dest;
5166 /* Function vect_grouped_store_supported.
5168 Returns TRUE if interleave high and interleave low permutations
5169 are supported, and FALSE otherwise. */
5171 bool
5172 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5174 machine_mode mode = TYPE_MODE (vectype);
5176 /* vect_permute_store_chain requires the group size to be equal to 3 or
5177 be a power of two. */
5178 if (count != 3 && exact_log2 (count) == -1)
5180 if (dump_enabled_p ())
5181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5182 "the size of the group of accesses"
5183 " is not a power of 2 or not eqaul to 3\n");
5184 return false;
5187 /* Check that the permutation is supported. */
5188 if (VECTOR_MODE_P (mode))
5190 unsigned int i;
5191 if (count == 3)
5193 unsigned int j0 = 0, j1 = 0, j2 = 0;
5194 unsigned int i, j;
5196 unsigned int nelt;
5197 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5199 if (dump_enabled_p ())
5200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5201 "cannot handle groups of 3 stores for"
5202 " variable-length vectors\n");
5203 return false;
5206 vec_perm_builder sel (nelt, nelt, 1);
5207 sel.quick_grow (nelt);
5208 vec_perm_indices indices;
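/* Check the two permutations that vect_permute_store_chain generates
for each of the three output vectors when interleaving a group of
three stores. */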
5209 for (j = 0; j < 3; j++)
5211 int nelt0 = ((3 - j) * nelt) % 3;
5212 int nelt1 = ((3 - j) * nelt + 1) % 3;
5213 int nelt2 = ((3 - j) * nelt + 2) % 3;
5214 for (i = 0; i < nelt; i++)
5216 if (3 * i + nelt0 < nelt)
5217 sel[3 * i + nelt0] = j0++;
5218 if (3 * i + nelt1 < nelt)
5219 sel[3 * i + nelt1] = nelt + j1++;
5220 if (3 * i + nelt2 < nelt)
5221 sel[3 * i + nelt2] = 0;
5223 indices.new_vector (sel, 2, nelt);
5224 if (!can_vec_perm_const_p (mode, indices))
5226 if (dump_enabled_p ())
5227 dump_printf (MSG_MISSED_OPTIMIZATION,
5228 "permutation op not supported by target.\n");
5229 return false;
5232 for (i = 0; i < nelt; i++)
5234 if (3 * i + nelt0 < nelt)
5235 sel[3 * i + nelt0] = 3 * i + nelt0;
5236 if (3 * i + nelt1 < nelt)
5237 sel[3 * i + nelt1] = 3 * i + nelt1;
5238 if (3 * i + nelt2 < nelt)
5239 sel[3 * i + nelt2] = nelt + j2++;
5241 indices.new_vector (sel, 2, nelt);
5242 if (!can_vec_perm_const_p (mode, indices))
5244 if (dump_enabled_p ())
5245 dump_printf (MSG_MISSED_OPTIMIZATION,
5246 "permutation op not supported by target.\n");
5247 return false;
5250 return true;
5252 else
5254 /* If the length is not equal to 3, then only a power of 2 is supported. */
5255 gcc_assert (pow2p_hwi (count));
5256 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5258 /* The encoding has 2 interleaved stepped patterns. */
5259 vec_perm_builder sel (nelt, 2, 3);
5260 sel.quick_grow (6);
5261 for (i = 0; i < 3; i++)
5263 sel[i * 2] = i;
5264 sel[i * 2 + 1] = i + nelt;
5266 vec_perm_indices indices (sel, 2, nelt);
5267 if (can_vec_perm_const_p (mode, indices))
5269 for (i = 0; i < 6; i++)
5270 sel[i] += exact_div (nelt, 2);
5271 indices.new_vector (sel, 2, nelt);
5272 if (can_vec_perm_const_p (mode, indices))
5273 return true;
5278 if (dump_enabled_p ())
5279 dump_printf (MSG_MISSED_OPTIMIZATION,
5280 "permutation op not supported by target.\n");
5281 return false;
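/* Editor's sketch (not part of the original file): the index patterns whose
   target support vect_grouped_store_supported checks above, spelled out with
   plain unsigned ints instead of GCC's vec_perm_builder/poly_uint64.  The
   fixed bound of 64 lanes and the function names are illustrative assumptions
   of the sketch only.  */

/* Interleave-high selector for a power-of-2 group: { 0, nelt, 1, nelt + 1,
   ... }.  Adding nelt / 2 to every element gives the interleave-low
   selector, exactly as the code above does for its 6 encoded elements.  */
static void
sketch_interleave_high_sel (unsigned int nelt, unsigned int *sel)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel[2 * i] = i;                /* lane i of the first input vector  */
      sel[2 * i + 1] = nelt + i;     /* lane i of the second input vector */
    }
}

/* The three "low" selectors used for a group of 3 stores; j0/j1 are carried
   across the three permutations just as in the loop above, and lanes written
   as 0 are don't-care lanes filled by the second ("high") permutation.  */
static void
sketch_store3_low_sels (unsigned int nelt, unsigned int sel[3][64])
{
  unsigned int j0 = 0, j1 = 0;
  for (unsigned int j = 0; j < 3; j++)
    {
      unsigned int nelt0 = (3 - j) * nelt % 3;
      unsigned int nelt1 = ((3 - j) * nelt + 1) % 3;
      unsigned int nelt2 = ((3 - j) * nelt + 2) % 3;
      for (unsigned int i = 0; i < nelt; i++)
        {
          if (3 * i + nelt0 < nelt)
            sel[j][3 * i + nelt0] = j0++;          /* from the first input  */
          if (3 * i + nelt1 < nelt)
            sel[j][3 * i + nelt1] = nelt + j1++;   /* from the second input */
          if (3 * i + nelt2 < nelt)
            sel[j][3 * i + nelt2] = 0;             /* don't-care lane       */
        }
    }
}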
5285 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5286 type VECTYPE. MASKED_P says whether the masked form is needed. */
5288 bool
5289 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5290 bool masked_p)
5292 if (masked_p)
5293 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5294 vec_mask_store_lanes_optab,
5295 vectype, count);
5296 else
5297 return vect_lanes_optab_supported_p ("vec_store_lanes",
5298 vec_store_lanes_optab,
5299 vectype, count);
5303 /* Function vect_permute_store_chain.
5305 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5306 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5307 the data correctly for the stores. Return the final references for stores
5308 in RESULT_CHAIN.
5310 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5311 The input is 4 vectors each containing 8 elements. We assign a number to
5312 each element, the input sequence is:
5314 1st vec: 0 1 2 3 4 5 6 7
5315 2nd vec: 8 9 10 11 12 13 14 15
5316 3rd vec: 16 17 18 19 20 21 22 23
5317 4th vec: 24 25 26 27 28 29 30 31
5319 The output sequence should be:
5321 1st vec: 0 8 16 24 1 9 17 25
5322 2nd vec: 2 10 18 26 3 11 19 27
5323 3rd vec: 4 12 20 28 5 13 21 29
5324 4th vec: 6 14 22 30 7 15 23 31
5326 i.e., we interleave the contents of the four vectors in their order.
5328 We use interleave_high/low instructions to create such output. The input of
5329 each interleave_high/low operation is two vectors:
5330 1st vec 2nd vec
5331 0 1 2 3 4 5 6 7
5332 the even elements of the result vector are obtained left-to-right from the
5333 high/low elements of the first vector. The odd elements of the result are
5334 obtained left-to-right from the high/low elements of the second vector.
5335 The output of interleave_high will be: 0 4 1 5
5336 and of interleave_low: 2 6 3 7
5339 The permutation is done in log LENGTH stages. In each stage interleave_high
5340 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5341 where the first argument is taken from the first half of DR_CHAIN and the
5342 second argument from its second half.
5343 In our example,
5345 I1: interleave_high (1st vec, 3rd vec)
5346 I2: interleave_low (1st vec, 3rd vec)
5347 I3: interleave_high (2nd vec, 4th vec)
5348 I4: interleave_low (2nd vec, 4th vec)
5350 The output for the first stage is:
5352 I1: 0 16 1 17 2 18 3 19
5353 I2: 4 20 5 21 6 22 7 23
5354 I3: 8 24 9 25 10 26 11 27
5355 I4: 12 28 13 29 14 30 15 31
5357 The output of the second stage, i.e. the final result is:
5359 I1: 0 8 16 24 1 9 17 25
5360 I2: 2 10 18 26 3 11 19 27
5361 I3: 4 12 20 28 5 13 21 29
5362 I4: 6 14 22 30 7 15 23 31. */
5364 void
5365 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5366 unsigned int length,
5367 stmt_vec_info stmt_info,
5368 gimple_stmt_iterator *gsi,
5369 vec<tree> *result_chain)
5371 tree vect1, vect2, high, low;
5372 gimple *perm_stmt;
5373 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5374 tree perm_mask_low, perm_mask_high;
5375 tree data_ref;
5376 tree perm3_mask_low, perm3_mask_high;
5377 unsigned int i, j, n, log_length = exact_log2 (length);
5379 result_chain->quick_grow (length);
5380 memcpy (result_chain->address (), dr_chain.address (),
5381 length * sizeof (tree));
5383 if (length == 3)
5385 /* vect_grouped_store_supported ensures that this is constant. */
5386 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5387 unsigned int j0 = 0, j1 = 0, j2 = 0;
5389 vec_perm_builder sel (nelt, nelt, 1);
5390 sel.quick_grow (nelt);
5391 vec_perm_indices indices;
5392 for (j = 0; j < 3; j++)
5394 int nelt0 = ((3 - j) * nelt) % 3;
5395 int nelt1 = ((3 - j) * nelt + 1) % 3;
5396 int nelt2 = ((3 - j) * nelt + 2) % 3;
5398 for (i = 0; i < nelt; i++)
5400 if (3 * i + nelt0 < nelt)
5401 sel[3 * i + nelt0] = j0++;
5402 if (3 * i + nelt1 < nelt)
5403 sel[3 * i + nelt1] = nelt + j1++;
5404 if (3 * i + nelt2 < nelt)
5405 sel[3 * i + nelt2] = 0;
5407 indices.new_vector (sel, 2, nelt);
5408 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5410 for (i = 0; i < nelt; i++)
5412 if (3 * i + nelt0 < nelt)
5413 sel[3 * i + nelt0] = 3 * i + nelt0;
5414 if (3 * i + nelt1 < nelt)
5415 sel[3 * i + nelt1] = 3 * i + nelt1;
5416 if (3 * i + nelt2 < nelt)
5417 sel[3 * i + nelt2] = nelt + j2++;
5419 indices.new_vector (sel, 2, nelt);
5420 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5422 vect1 = dr_chain[0];
5423 vect2 = dr_chain[1];
5425 /* Create interleaving stmt:
5426 low = VEC_PERM_EXPR <vect1, vect2,
5427 {j, nelt, *, j + 1, nelt + j + 1, *,
5428 j + 2, nelt + j + 2, *, ...}> */
5429 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5430 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5431 vect2, perm3_mask_low);
5432 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5434 vect1 = data_ref;
5435 vect2 = dr_chain[2];
5436 /* Create interleaving stmt:
5437 low = VEC_PERM_EXPR <vect1, vect2,
5438 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5439 6, 7, nelt + j + 2, ...}> */
5440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5442 vect2, perm3_mask_high);
5443 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5444 (*result_chain)[j] = data_ref;
5447 else
5449 /* If length is not equal to 3 then only power of 2 is supported. */
5450 gcc_assert (pow2p_hwi (length));
5452 /* The encoding has 2 interleaved stepped patterns. */
5453 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5454 vec_perm_builder sel (nelt, 2, 3);
5455 sel.quick_grow (6);
5456 for (i = 0; i < 3; i++)
5458 sel[i * 2] = i;
5459 sel[i * 2 + 1] = i + nelt;
5461 vec_perm_indices indices (sel, 2, nelt);
5462 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5464 for (i = 0; i < 6; i++)
5465 sel[i] += exact_div (nelt, 2);
5466 indices.new_vector (sel, 2, nelt);
5467 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5469 for (i = 0, n = log_length; i < n; i++)
5471 for (j = 0; j < length/2; j++)
5473 vect1 = dr_chain[j];
5474 vect2 = dr_chain[j+length/2];
5476 /* Create interleaving stmt:
5477 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5478 ...}> */
5479 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5480 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5481 vect2, perm_mask_high);
5482 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5483 (*result_chain)[2*j] = high;
5485 /* Create interleaving stmt:
5486 low = VEC_PERM_EXPR <vect1, vect2,
5487 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5488 ...}> */
5489 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5490 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5491 vect2, perm_mask_low);
5492 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5493 (*result_chain)[2*j+1] = low;
5495 memcpy (dr_chain.address (), result_chain->address (),
5496 length * sizeof (tree));
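/* Editor's sketch (not part of the original file): a standalone scalar model
   of the interleave stages built above, so the 4-vector/8-element example in
   the function comment can be checked by hand.  Plain int arrays stand in
   for vector registers and VEC_PERM_EXPR; the fixed buffer sizes (nelt <= 64,
   length * nelt <= 256) and the function names are assumptions of the sketch,
   not GCC interfaces.  */
#include <string.h>

/* Model of VEC_PERM_EXPR <v1, v2, sel> on nelt-element int vectors.  */
static void
sketch_vec_perm (unsigned int nelt, const int *v1, const int *v2,
                 const unsigned int *sel, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    out[i] = sel[i] < nelt ? v1[sel[i]] : v2[sel[i] - nelt];
}

/* The power-of-2 store permutation: log2 (length) stages of interleave
   high/low over a chain of length vectors, vector j living at
   chain + j * nelt.  Running it on the vectors 0..31 from the comment
   reproduces the documented output (and confirms element 29, not 30, in
   the third result vector).  */
static void
sketch_permute_store_chain (unsigned int nelt, unsigned int length, int *chain)
{
  unsigned int sel_hi[64], sel_lo[64];
  int result[256];
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel_hi[2 * i] = i;
      sel_hi[2 * i + 1] = nelt + i;
      sel_lo[2 * i] = nelt / 2 + i;
      sel_lo[2 * i + 1] = nelt + nelt / 2 + i;
    }
  for (unsigned int stage = 0; (length >> stage) > 1; stage++)
    {
      for (unsigned int j = 0; j < length / 2; j++)
        {
          const int *v1 = chain + j * nelt;
          const int *v2 = chain + (j + length / 2) * nelt;
          sketch_vec_perm (nelt, v1, v2, sel_hi, result + 2 * j * nelt);
          sketch_vec_perm (nelt, v1, v2, sel_lo, result + (2 * j + 1) * nelt);
        }
      memcpy (chain, result, length * nelt * sizeof (int));
    }
}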
5501 /* Function vect_setup_realignment
5503 This function is called when vectorizing an unaligned load using
5504 the dr_explicit_realign[_optimized] scheme.
5505 This function generates the following code at the loop prolog:
5507 p = initial_addr;
5508 x msq_init = *(floor(p)); # prolog load
5509 realignment_token = call target_builtin;
5510 loop:
5511 x msq = phi (msq_init, ---)
5513 The stmts marked with x are generated only for the case of
5514 dr_explicit_realign_optimized.
5516 The code above sets up a new (vector) pointer, pointing to the first
5517 location accessed by STMT_INFO, and a "floor-aligned" load using that
5518 pointer. It also generates code to compute the "realignment-token"
5519 (if the relevant target hook was defined), and creates a phi-node at the
5520 loop-header bb whose arguments are the result of the prolog-load (created
5521 by this function) and the result of a load that takes place in the loop
5522 (to be created by the caller to this function).
5524 For the case of dr_explicit_realign_optimized:
5525 The caller to this function uses the phi-result (msq) to create the
5526 realignment code inside the loop, and sets up the missing phi argument,
5527 as follows:
5528 loop:
5529 msq = phi (msq_init, lsq)
5530 lsq = *(floor(p')); # load in loop
5531 result = realign_load (msq, lsq, realignment_token);
5533 For the case of dr_explicit_realign:
5534 loop:
5535 msq = *(floor(p)); # load in loop
5536 p' = p + (VS-1);
5537 lsq = *(floor(p')); # load in loop
5538 result = realign_load (msq, lsq, realignment_token);
5540 Input:
5541 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5542 a memory location that may be unaligned.
5543 BSI - place where new code is to be inserted.
5544 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5545 is used.
5547 Output:
5548 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5549 target hook, if defined.
5550 Return value - the result of the loop-header phi node. */
5552 tree
5553 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5554 gimple_stmt_iterator *gsi, tree *realignment_token,
5555 enum dr_alignment_support alignment_support_scheme,
5556 tree init_addr,
5557 class loop **at_loop)
5559 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5560 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5561 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5562 struct data_reference *dr = dr_info->dr;
5563 class loop *loop = NULL;
5564 edge pe = NULL;
5565 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5566 tree vec_dest;
5567 gimple *inc;
5568 tree ptr;
5569 tree data_ref;
5570 basic_block new_bb;
5571 tree msq_init = NULL_TREE;
5572 tree new_temp;
5573 gphi *phi_stmt;
5574 tree msq = NULL_TREE;
5575 gimple_seq stmts = NULL;
5576 bool compute_in_loop = false;
5577 bool nested_in_vect_loop = false;
5578 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5579 class loop *loop_for_initial_load = NULL;
5581 if (loop_vinfo)
5583 loop = LOOP_VINFO_LOOP (loop_vinfo);
5584 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5587 gcc_assert (alignment_support_scheme == dr_explicit_realign
5588 || alignment_support_scheme == dr_explicit_realign_optimized);
5590 /* We need to generate three things:
5591 1. the misalignment computation
5592 2. the extra vector load (for the optimized realignment scheme).
5593 3. the phi node for the two vectors from which the realignment is
5594 done (for the optimized realignment scheme). */
5596 /* 1. Determine where to generate the misalignment computation.
5598 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5599 calculation will be generated by this function, outside the loop (in the
5600 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5601 caller, inside the loop.
5603 Background: If the misalignment remains fixed throughout the iterations of
5604 the loop, then both realignment schemes are applicable, and also the
5605 misalignment computation can be done outside LOOP. This is because we are
5606 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5607 are a multiple of VS (the Vector Size), and therefore the misalignment in
5608 different vectorized LOOP iterations is always the same.
5609 The problem arises only if the memory access is in an inner-loop nested
5610 inside LOOP, which is now being vectorized using outer-loop vectorization.
5611 This is the only case when the misalignment of the memory access may not
5612 remain fixed throughout the iterations of the inner-loop (as explained in
5613 detail in vect_supportable_dr_alignment). In this case, not only is the
5614 optimized realignment scheme not applicable, but also the misalignment
5615 computation (and generation of the realignment token that is passed to
5616 REALIGN_LOAD) have to be done inside the loop.
5618 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5619 or not, which in turn determines if the misalignment is computed inside
5620 the inner-loop, or outside LOOP. */
5622 if (init_addr != NULL_TREE || !loop_vinfo)
5624 compute_in_loop = true;
5625 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5629 /* 2. Determine where to generate the extra vector load.
5631 For the optimized realignment scheme, instead of generating two vector
5632 loads in each iteration, we generate a single extra vector load in the
5633 preheader of the loop, and in each iteration reuse the result of the
5634 vector load from the previous iteration. In case the memory access is in
5635 an inner-loop nested inside LOOP, which is now being vectorized using
5636 outer-loop vectorization, we need to determine whether this initial vector
5637 load should be generated at the preheader of the inner-loop, or can be
5638 generated at the preheader of LOOP. If the memory access has no evolution
5639 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5640 to be generated inside LOOP (in the preheader of the inner-loop). */
5642 if (nested_in_vect_loop)
5644 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5645 bool invariant_in_outerloop =
5646 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5647 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5649 else
5650 loop_for_initial_load = loop;
5651 if (at_loop)
5652 *at_loop = loop_for_initial_load;
5654 if (loop_for_initial_load)
5655 pe = loop_preheader_edge (loop_for_initial_load);
5657 /* 3. For the case of the optimized realignment, create the first vector
5658 load at the loop preheader. */
5660 if (alignment_support_scheme == dr_explicit_realign_optimized)
5662 /* Create msq_init = *(floor(p1)) in the loop preheader */
5663 gassign *new_stmt;
5665 gcc_assert (!compute_in_loop);
5666 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5667 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5668 loop_for_initial_load, NULL_TREE,
5669 &init_addr, NULL, &inc, true);
5670 if (TREE_CODE (ptr) == SSA_NAME)
5671 new_temp = copy_ssa_name (ptr);
5672 else
5673 new_temp = make_ssa_name (TREE_TYPE (ptr));
5674 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5675 tree type = TREE_TYPE (ptr);
5676 new_stmt = gimple_build_assign
5677 (new_temp, BIT_AND_EXPR, ptr,
5678 fold_build2 (MINUS_EXPR, type,
5679 build_int_cst (type, 0),
5680 build_int_cst (type, align)));
5681 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5682 gcc_assert (!new_bb);
5683 data_ref
5684 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5685 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5686 vect_copy_ref_info (data_ref, DR_REF (dr));
5687 new_stmt = gimple_build_assign (vec_dest, data_ref);
5688 new_temp = make_ssa_name (vec_dest, new_stmt);
5689 gimple_assign_set_lhs (new_stmt, new_temp);
5690 if (pe)
5692 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5693 gcc_assert (!new_bb);
5695 else
5696 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5698 msq_init = gimple_assign_lhs (new_stmt);
5701 /* 4. Create realignment token using a target builtin, if available.
5702 It is done either inside the containing loop, or before LOOP (as
5703 determined above). */
5705 if (targetm.vectorize.builtin_mask_for_load)
5707 gcall *new_stmt;
5708 tree builtin_decl;
5710 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5711 if (!init_addr)
5713 /* Generate the INIT_ADDR computation outside LOOP. */
5714 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5715 stmt_info, &stmts,
5716 NULL_TREE);
5717 if (loop)
5719 pe = loop_preheader_edge (loop);
5720 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5721 gcc_assert (!new_bb);
5723 else
5724 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5727 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5728 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5729 vec_dest =
5730 vect_create_destination_var (scalar_dest,
5731 gimple_call_return_type (new_stmt));
5732 new_temp = make_ssa_name (vec_dest, new_stmt);
5733 gimple_call_set_lhs (new_stmt, new_temp);
5735 if (compute_in_loop)
5736 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5737 else
5739 /* Generate the misalignment computation outside LOOP. */
5740 pe = loop_preheader_edge (loop);
5741 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5742 gcc_assert (!new_bb);
5745 *realignment_token = gimple_call_lhs (new_stmt);
5747 /* The result of the CALL_EXPR to this builtin is determined from
5748 the value of the parameter and no global variables are touched
5749 which makes the builtin a "const" function. Requiring the
5750 builtin to have the "const" attribute makes it unnecessary
5751 to call mark_call_clobbered. */
5752 gcc_assert (TREE_READONLY (builtin_decl));
5755 if (alignment_support_scheme == dr_explicit_realign)
5756 return msq;
5758 gcc_assert (!compute_in_loop);
5759 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5762 /* 5. Create msq = phi <msq_init, lsq> in loop */
5764 pe = loop_preheader_edge (containing_loop);
5765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5766 msq = make_ssa_name (vec_dest);
5767 phi_stmt = create_phi_node (msq, containing_loop->header);
5768 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5770 return msq;
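/* Editor's sketch (not part of the original file): what the realignment
   scheme set up above computes, modelled on byte arrays.  floor (p) is p
   rounded down to the vector alignment (mirroring the BIT_AND_EXPR with
   -align built above), msq/lsq are the two aligned loads, and the
   "realignment token" is taken to be the byte misalignment of p; real
   targets encode the token differently (e.g. as a permute mask), so this is
   an illustration of the data flow only.  Assumes vs is a power of 2.  */
#include <stdint.h>

/* Model of result = REALIGN_LOAD <msq, lsq, token> for vs-byte vectors.  */
static void
sketch_realign_load (unsigned int vs, const uint8_t *msq, const uint8_t *lsq,
                     unsigned int token, uint8_t *result)
{
  for (unsigned int i = 0; i < vs; i++)
    result[i] = i + token < vs ? msq[i + token] : lsq[i + token - vs];
}

/* An unaligned vs-byte load at p built from the two aligned loads
   msq = *(floor (p)) and lsq = *(floor (p) + vs); lsq is only read when p
   is actually misaligned.  */
static void
sketch_unaligned_load (unsigned int vs, const uint8_t *p, uint8_t *result)
{
  uintptr_t addr = (uintptr_t) p;
  const uint8_t *floor_p = (const uint8_t *) (addr & -(uintptr_t) vs);
  unsigned int token = (unsigned int) (addr - (uintptr_t) floor_p);
  sketch_realign_load (vs, floor_p, floor_p + vs, token, result);
}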
5774 /* Function vect_grouped_load_supported.
5776 COUNT is the size of the load group (the number of statements plus the
5777 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5778 only one statement, with a gap of COUNT - 1.
5780 Returns true if a suitable permute exists. */
5782 bool
5783 vect_grouped_load_supported (tree vectype, bool single_element_p,
5784 unsigned HOST_WIDE_INT count)
5786 machine_mode mode = TYPE_MODE (vectype);
5788 /* If this is single-element interleaving with an element distance
5789 that leaves unused vector loads around punt - we at least create
5790 very sub-optimal code in that case (and blow up memory,
5791 see PR65518). */
5792 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5794 if (dump_enabled_p ())
5795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5796 "single-element interleaving not supported "
5797 "for not adjacent vector loads\n");
5798 return false;
5801 /* vect_permute_load_chain requires the group size to be equal to 3 or
5802 be a power of two. */
5803 if (count != 3 && exact_log2 (count) == -1)
5805 if (dump_enabled_p ())
5806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5807 "the size of the group of accesses"
5808 " is not a power of 2 or not equal to 3\n");
5809 return false;
5812 /* Check that the permutation is supported. */
5813 if (VECTOR_MODE_P (mode))
5815 unsigned int i, j;
5816 if (count == 3)
5818 unsigned int nelt;
5819 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5821 if (dump_enabled_p ())
5822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5823 "cannot handle groups of 3 loads for"
5824 " variable-length vectors\n");
5825 return false;
5828 vec_perm_builder sel (nelt, nelt, 1);
5829 sel.quick_grow (nelt);
5830 vec_perm_indices indices;
5831 unsigned int k;
5832 for (k = 0; k < 3; k++)
5834 for (i = 0; i < nelt; i++)
5835 if (3 * i + k < 2 * nelt)
5836 sel[i] = 3 * i + k;
5837 else
5838 sel[i] = 0;
5839 indices.new_vector (sel, 2, nelt);
5840 if (!can_vec_perm_const_p (mode, indices))
5842 if (dump_enabled_p ())
5843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5844 "shuffle of 3 loads is not supported by"
5845 " target\n");
5846 return false;
5848 for (i = 0, j = 0; i < nelt; i++)
5849 if (3 * i + k < 2 * nelt)
5850 sel[i] = i;
5851 else
5852 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5853 indices.new_vector (sel, 2, nelt);
5854 if (!can_vec_perm_const_p (mode, indices))
5856 if (dump_enabled_p ())
5857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5858 "shuffle of 3 loads is not supported by"
5859 " target\n");
5860 return false;
5863 return true;
5865 else
5867 /* If length is not equal to 3 then only power of 2 is supported. */
5868 gcc_assert (pow2p_hwi (count));
5869 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5871 /* The encoding has a single stepped pattern. */
5872 vec_perm_builder sel (nelt, 1, 3);
5873 sel.quick_grow (3);
5874 for (i = 0; i < 3; i++)
5875 sel[i] = i * 2;
5876 vec_perm_indices indices (sel, 2, nelt);
5877 if (can_vec_perm_const_p (mode, indices))
5879 for (i = 0; i < 3; i++)
5880 sel[i] = i * 2 + 1;
5881 indices.new_vector (sel, 2, nelt);
5882 if (can_vec_perm_const_p (mode, indices))
5883 return true;
5888 if (dump_enabled_p ())
5889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5890 "extract even/odd not supported by target\n");
5891 return false;
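/* Editor's sketch (not part of the original file): the two selectors tested
   above for a group of 3 loads, for output index k, using plain unsigned
   ints in place of vec_perm_builder.  For nelt == 8 and k == 0 this gives
   { 0, 3, 6, 9, 12, 15, *, * } (gather every third element from the first
   two vectors, * = don't care) followed by { 0, 1, 2, 3, 4, 5, 10, 13 }
   (keep those lanes and fill the tail from the third vector), i.e. the
   scalar elements 0 3 6 9 12 15 18 21.  Names are illustrative only.  */
static void
sketch_load3_sels (unsigned int nelt, unsigned int k,
                   unsigned int *sel_low, unsigned int *sel_high)
{
  unsigned int j = 0;
  for (unsigned int i = 0; i < nelt; i++)
    {
      sel_low[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
      sel_high[i] = 3 * i + k < 2 * nelt
                    ? i
                    : nelt + (nelt + k) % 3 + 3 * j++;
    }
}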
5894 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5895 type VECTYPE. MASKED_P says whether the masked form is needed. */
5897 bool
5898 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5899 bool masked_p)
5901 if (masked_p)
5902 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5903 vec_mask_load_lanes_optab,
5904 vectype, count);
5905 else
5906 return vect_lanes_optab_supported_p ("vec_load_lanes",
5907 vec_load_lanes_optab,
5908 vectype, count);
5911 /* Function vect_permute_load_chain.
5913 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5914 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5915 the input data correctly. Return the final references for loads in
5916 RESULT_CHAIN.
5918 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5919 The input is 4 vectors each containing 8 elements. We assign a number to each
5920 element, the input sequence is:
5922 1st vec: 0 1 2 3 4 5 6 7
5923 2nd vec: 8 9 10 11 12 13 14 15
5924 3rd vec: 16 17 18 19 20 21 22 23
5925 4th vec: 24 25 26 27 28 29 30 31
5927 The output sequence should be:
5929 1st vec: 0 4 8 12 16 20 24 28
5930 2nd vec: 1 5 9 13 17 21 25 29
5931 3rd vec: 2 6 10 14 18 22 26 30
5932 4th vec: 3 7 11 15 19 23 27 31
5934 i.e., the first output vector should contain the first elements of each
5935 interleaving group, etc.
5937 We use extract_even/odd instructions to create such output. The input of
5938 each extract_even/odd operation is two vectors
5939 1st vec 2nd vec
5940 0 1 2 3 4 5 6 7
5942 and the output is the vector of extracted even/odd elements. The output of
5943 extract_even will be: 0 2 4 6
5944 and of extract_odd: 1 3 5 7
5947 The permutation is done in log LENGTH stages. In each stage extract_even
5948 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5949 their order. In our example,
5951 E1: extract_even (1st vec, 2nd vec)
5952 E2: extract_odd (1st vec, 2nd vec)
5953 E3: extract_even (3rd vec, 4th vec)
5954 E4: extract_odd (3rd vec, 4th vec)
5956 The output for the first stage will be:
5958 E1: 0 2 4 6 8 10 12 14
5959 E2: 1 3 5 7 9 11 13 15
5960 E3: 16 18 20 22 24 26 28 30
5961 E4: 17 19 21 23 25 27 29 31
5963 In order to proceed and create the correct sequence for the next stage (or
5964 for the correct output, if the second stage is the last one, as in our
5965 example), we first put the output of extract_even operation and then the
5966 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5967 The input for the second stage is:
5969 1st vec (E1): 0 2 4 6 8 10 12 14
5970 2nd vec (E3): 16 18 20 22 24 26 28 30
5971 3rd vec (E2): 1 3 5 7 9 11 13 15
5972 4th vec (E4): 17 19 21 23 25 27 29 31
5974 The output of the second stage:
5976 E1: 0 4 8 12 16 20 24 28
5977 E2: 2 6 10 14 18 22 26 30
5978 E3: 1 5 9 13 17 21 25 29
5979 E4: 3 7 11 15 19 23 27 31
5981 And RESULT_CHAIN after reordering:
5983 1st vec (E1): 0 4 8 12 16 20 24 28
5984 2nd vec (E3): 1 5 9 13 17 21 25 29
5985 3rd vec (E2): 2 6 10 14 18 22 26 30
5986 4th vec (E4): 3 7 11 15 19 23 27 31. */
5988 static void
5989 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5990 unsigned int length,
5991 stmt_vec_info stmt_info,
5992 gimple_stmt_iterator *gsi,
5993 vec<tree> *result_chain)
5995 tree data_ref, first_vect, second_vect;
5996 tree perm_mask_even, perm_mask_odd;
5997 tree perm3_mask_low, perm3_mask_high;
5998 gimple *perm_stmt;
5999 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6000 unsigned int i, j, log_length = exact_log2 (length);
6002 result_chain->quick_grow (length);
6003 memcpy (result_chain->address (), dr_chain.address (),
6004 length * sizeof (tree));
6006 if (length == 3)
6008 /* vect_grouped_load_supported ensures that this is constant. */
6009 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6010 unsigned int k;
6012 vec_perm_builder sel (nelt, nelt, 1);
6013 sel.quick_grow (nelt);
6014 vec_perm_indices indices;
6015 for (k = 0; k < 3; k++)
6017 for (i = 0; i < nelt; i++)
6018 if (3 * i + k < 2 * nelt)
6019 sel[i] = 3 * i + k;
6020 else
6021 sel[i] = 0;
6022 indices.new_vector (sel, 2, nelt);
6023 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6025 for (i = 0, j = 0; i < nelt; i++)
6026 if (3 * i + k < 2 * nelt)
6027 sel[i] = i;
6028 else
6029 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6030 indices.new_vector (sel, 2, nelt);
6031 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6033 first_vect = dr_chain[0];
6034 second_vect = dr_chain[1];
6036 /* Create interleaving stmt (low part of):
6037 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6038 ...}> */
6039 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6040 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6041 second_vect, perm3_mask_low);
6042 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6044 /* Create interleaving stmt (high part of):
6045 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6046 ...}> */
6047 first_vect = data_ref;
6048 second_vect = dr_chain[2];
6049 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6050 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6051 second_vect, perm3_mask_high);
6052 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6053 (*result_chain)[k] = data_ref;
6056 else
6058 /* If length is not equal to 3 then only power of 2 is supported. */
6059 gcc_assert (pow2p_hwi (length));
6061 /* The encoding has a single stepped pattern. */
6062 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6063 vec_perm_builder sel (nelt, 1, 3);
6064 sel.quick_grow (3);
6065 for (i = 0; i < 3; ++i)
6066 sel[i] = i * 2;
6067 vec_perm_indices indices (sel, 2, nelt);
6068 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6070 for (i = 0; i < 3; ++i)
6071 sel[i] = i * 2 + 1;
6072 indices.new_vector (sel, 2, nelt);
6073 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6075 for (i = 0; i < log_length; i++)
6077 for (j = 0; j < length; j += 2)
6079 first_vect = dr_chain[j];
6080 second_vect = dr_chain[j+1];
6082 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6083 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6084 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6085 first_vect, second_vect,
6086 perm_mask_even);
6087 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6088 (*result_chain)[j/2] = data_ref;
6090 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6091 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6092 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6093 first_vect, second_vect,
6094 perm_mask_odd);
6095 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6096 (*result_chain)[j/2+length/2] = data_ref;
6098 memcpy (dr_chain.address (), result_chain->address (),
6099 length * sizeof (tree));
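/* Editor's sketch (not part of the original file): a standalone scalar model
   of the extract_even/odd stages above, matching the 4-vector/8-element
   example in the function comment.  Plain int arrays stand in for vector
   registers; the fixed buffer size (length * nelt <= 256) and the function
   name are assumptions of the sketch.  */
#include <string.h>

static void
sketch_permute_load_chain (unsigned int nelt, unsigned int length, int *chain)
{
  int result[256];
  for (unsigned int stage = 0; (length >> stage) > 1; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          const int *v1 = chain + j * nelt;
          const int *v2 = chain + (j + 1) * nelt;
          /* Even results go to the first half of the chain, odd results to
             the second half, exactly as (*result_chain)[j/2] and
             (*result_chain)[j/2 + length/2] above.  */
          int *even = result + (j / 2) * nelt;
          int *odd = result + (j / 2 + length / 2) * nelt;
          for (unsigned int i = 0; i < nelt; i++)
            {
              even[i] = 2 * i < nelt ? v1[2 * i] : v2[2 * i - nelt];
              odd[i] = 2 * i + 1 < nelt ? v1[2 * i + 1] : v2[2 * i + 1 - nelt];
            }
        }
      memcpy (chain, result, length * nelt * sizeof (int));
    }
}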
6104 /* Function vect_shift_permute_load_chain.
6106 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6107 sequence of stmts to reorder the input data accordingly.
6108 Return the final references for loads in RESULT_CHAIN.
6109 Return true if successful, false otherwise.
6111 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6112 The input is 3 vectors each containing 8 elements. We assign a
6113 number to each element, the input sequence is:
6115 1st vec: 0 1 2 3 4 5 6 7
6116 2nd vec: 8 9 10 11 12 13 14 15
6117 3rd vec: 16 17 18 19 20 21 22 23
6119 The output sequence should be:
6121 1st vec: 0 3 6 9 12 15 18 21
6122 2nd vec: 1 4 7 10 13 16 19 22
6123 3rd vec: 2 5 8 11 14 17 20 23
6125 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6127 First we shuffle all 3 vectors to get correct elements order:
6129 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6130 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6131 3rd vec: (16 19 22) (17 20 23) (18 21)
6133 Next we unite and shift the vectors 3 times:
6135 1st step:
6136 shift right by 6 the concatenation of:
6137 "1st vec" and "2nd vec"
6138 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6139 "2nd vec" and "3rd vec"
6140 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6141 "3rd vec" and "1st vec"
6142 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6143 | New vectors |
6145 So that now new vectors are:
6147 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6148 2nd vec: (10 13) (16 19 22) (17 20 23)
6149 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6151 2nd step:
6152 shift right by 5 the concatenation of:
6153 "1st vec" and "3rd vec"
6154 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6155 "2nd vec" and "1st vec"
6156 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6157 "3rd vec" and "2nd vec"
6158 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6159 | New vectors |
6161 So that now new vectors are:
6163 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6164 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6165 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6167 3rd step:
6168 shift right by 5 the concatenation of:
6169 "1st vec" and "1st vec"
6170 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6171 shift right by 3 the concatenation of:
6172 "2nd vec" and "2nd vec"
6173 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6174 | New vectors |
6176 So that now all vectors are READY:
6177 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6178 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6179 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6181 This algorithm is faster than the one in vect_permute_load_chain if:
6182 1. "shift of a concatenation" is faster than general permutation.
6183 This is usually so.
6184 2. The TARGET machine can't execute vector instructions in parallel.
6185 This is because each step of the algorithm depends on the previous one.
6186 The algorithm in vect_permute_load_chain is much more parallel.
6188 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6191 static bool
6192 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6193 unsigned int length,
6194 stmt_vec_info stmt_info,
6195 gimple_stmt_iterator *gsi,
6196 vec<tree> *result_chain)
6198 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6199 tree perm2_mask1, perm2_mask2, perm3_mask;
6200 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6201 gimple *perm_stmt;
6203 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6204 unsigned int i;
6205 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6207 unsigned HOST_WIDE_INT nelt, vf;
6208 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6209 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6210 /* Not supported for variable-length vectors. */
6211 return false;
6213 vec_perm_builder sel (nelt, nelt, 1);
6214 sel.quick_grow (nelt);
6216 result_chain->quick_grow (length);
6217 memcpy (result_chain->address (), dr_chain.address (),
6218 length * sizeof (tree));
6220 if (pow2p_hwi (length) && vf > 4)
6222 unsigned int j, log_length = exact_log2 (length);
6223 for (i = 0; i < nelt / 2; ++i)
6224 sel[i] = i * 2;
6225 for (i = 0; i < nelt / 2; ++i)
6226 sel[nelt / 2 + i] = i * 2 + 1;
6227 vec_perm_indices indices (sel, 2, nelt);
6228 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "shuffle of 2 fields structure is not \
6233 supported by target\n");
6234 return false;
6236 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6238 for (i = 0; i < nelt / 2; ++i)
6239 sel[i] = i * 2 + 1;
6240 for (i = 0; i < nelt / 2; ++i)
6241 sel[nelt / 2 + i] = i * 2;
6242 indices.new_vector (sel, 2, nelt);
6243 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6245 if (dump_enabled_p ())
6246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6247 "shuffle of 2 fields structure is not \
6248 supported by target\n");
6249 return false;
6251 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6253 /* Generating permutation constant to shift all elements.
6254 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6255 for (i = 0; i < nelt; i++)
6256 sel[i] = nelt / 2 + i;
6257 indices.new_vector (sel, 2, nelt);
6258 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6260 if (dump_enabled_p ())
6261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262 "shift permutation is not supported by target\n");
6263 return false;
6265 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6267 /* Generating permutation constant to select vector from 2.
6268 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6269 for (i = 0; i < nelt / 2; i++)
6270 sel[i] = i;
6271 for (i = nelt / 2; i < nelt; i++)
6272 sel[i] = nelt + i;
6273 indices.new_vector (sel, 2, nelt);
6274 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6276 if (dump_enabled_p ())
6277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6278 "select is not supported by target\n");
6279 return false;
6281 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6283 for (i = 0; i < log_length; i++)
6285 for (j = 0; j < length; j += 2)
6287 first_vect = dr_chain[j];
6288 second_vect = dr_chain[j + 1];
6290 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6291 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6292 first_vect, first_vect,
6293 perm2_mask1);
6294 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6295 vect[0] = data_ref;
6297 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6298 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6299 second_vect, second_vect,
6300 perm2_mask2);
6301 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6302 vect[1] = data_ref;
6304 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6305 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6306 vect[0], vect[1], shift1_mask);
6307 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6308 (*result_chain)[j/2 + length/2] = data_ref;
6310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6312 vect[0], vect[1], select_mask);
6313 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6314 (*result_chain)[j/2] = data_ref;
6316 memcpy (dr_chain.address (), result_chain->address (),
6317 length * sizeof (tree));
6319 return true;
6321 if (length == 3 && vf > 2)
6323 unsigned int k = 0, l = 0;
6325 /* Generating permutation constant to get all elements in the right order.
6326 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6327 for (i = 0; i < nelt; i++)
6329 if (3 * k + (l % 3) >= nelt)
6331 k = 0;
6332 l += (3 - (nelt % 3));
6334 sel[i] = 3 * k + (l % 3);
6335 k++;
6337 vec_perm_indices indices (sel, 2, nelt);
6338 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6340 if (dump_enabled_p ())
6341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6342 "shuffle of 3 fields structure is not \
6343 supported by target\n");
6344 return false;
6346 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6348 /* Generating permutation constant to shift all elements.
6349 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6350 for (i = 0; i < nelt; i++)
6351 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6352 indices.new_vector (sel, 2, nelt);
6353 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "shift permutation is not supported by target\n");
6358 return false;
6360 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6362 /* Generating permutation constant to shift all elements.
6363 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6364 for (i = 0; i < nelt; i++)
6365 sel[i] = 2 * (nelt / 3) + 1 + i;
6366 indices.new_vector (sel, 2, nelt);
6367 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6369 if (dump_enabled_p ())
6370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6371 "shift permutation is not supported by target\n");
6372 return false;
6374 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6376 /* Generating permutation constant to shift all elements.
6377 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6378 for (i = 0; i < nelt; i++)
6379 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6380 indices.new_vector (sel, 2, nelt);
6381 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6383 if (dump_enabled_p ())
6384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6385 "shift permutation is not supported by target\n");
6386 return false;
6388 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6390 /* Generating permutation constant to shift all elements.
6391 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6392 for (i = 0; i < nelt; i++)
6393 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6394 indices.new_vector (sel, 2, nelt);
6395 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6397 if (dump_enabled_p ())
6398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6399 "shift permutation is not supported by target\n");
6400 return false;
6402 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6404 for (k = 0; k < 3; k++)
6406 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6407 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6408 dr_chain[k], dr_chain[k],
6409 perm3_mask);
6410 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6411 vect[k] = data_ref;
6414 for (k = 0; k < 3; k++)
6416 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6417 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6418 vect[k % 3], vect[(k + 1) % 3],
6419 shift1_mask);
6420 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6421 vect_shift[k] = data_ref;
6424 for (k = 0; k < 3; k++)
6426 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6427 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6428 vect_shift[(4 - k) % 3],
6429 vect_shift[(3 - k) % 3],
6430 shift2_mask);
6431 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6432 vect[k] = data_ref;
6435 (*result_chain)[3 - (nelt % 3)] = vect[2];
6437 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6438 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6439 vect[0], shift3_mask);
6440 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6441 (*result_chain)[nelt % 3] = data_ref;
6443 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6444 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6445 vect[1], shift4_mask);
6446 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6447 (*result_chain)[0] = data_ref;
6448 return true;
6450 return false;
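/* Editor's sketch (not part of the original file): the two key masks of the
   length == 3 path above, computed with plain unsigned ints.  For nelt == 8
   it reproduces the values quoted in the comments: the in-vector shuffle
   { 0 3 6 1 4 7 2 5 } and the first shift mask { 6 7 8 9 10 11 12 13 },
   which selects a window of nelt contiguous elements from the concatenation
   of two vectors.  Names are illustrative only.  */
static void
sketch_shift3_masks (unsigned int nelt, unsigned int *perm3,
                     unsigned int *shift1)
{
  unsigned int k = 0, l = 0;
  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Regroup one loaded vector as (0 3 6 ..) (1 4 7 ..) (2 5 ..).  */
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      perm3[i] = 3 * k + (l % 3);
      k++;
      /* Shift mask: lanes 2*(nelt/3) + nelt%3 .. of the concatenation.  */
      shift1[i] = 2 * (nelt / 3) + (nelt % 3) + i;
    }
}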
6453 /* Function vect_transform_grouped_load.
6455 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6456 to perform their permutation and ascribe the result vectorized statements to
6457 the scalar statements.
6460 void
6461 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6462 vec<tree> dr_chain,
6463 int size, gimple_stmt_iterator *gsi)
6465 machine_mode mode;
6466 vec<tree> result_chain = vNULL;
6468 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6469 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6470 vectors, that are ready for vector computation. */
6471 result_chain.create (size);
6473 /* If the reassociation width for the vector type is 2 or greater, the target
6474 machine can execute 2 or more vector instructions in parallel. Otherwise try
6475 to get the chain for the load group using vect_shift_permute_load_chain. */
6476 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6477 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6478 || pow2p_hwi (size)
6479 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6480 gsi, &result_chain))
6481 vect_permute_load_chain (vinfo, dr_chain,
6482 size, stmt_info, gsi, &result_chain);
6483 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6484 result_chain.release ();
6487 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6488 generated as part of the vectorization of STMT_INFO. Assign the statement
6489 for each vector to the associated scalar statement. */
6491 void
6492 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6493 vec<tree> result_chain)
6495 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6496 unsigned int i, gap_count;
6497 tree tmp_data_ref;
6499 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6500 Since we scan the chain starting from its first node, their order
6501 corresponds to the order of data-refs in RESULT_CHAIN. */
6502 stmt_vec_info next_stmt_info = first_stmt_info;
6503 gap_count = 1;
6504 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6506 if (!next_stmt_info)
6507 break;
6509 /* Skip the gaps. Loads created for the gaps will be removed by dead
6510 code elimination pass later. No need to check for the first stmt in
6511 the group, since it always exists.
6512 DR_GROUP_GAP is the number of steps in elements from the previous
6513 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6514 correspond to the gaps. */
6515 if (next_stmt_info != first_stmt_info
6516 && gap_count < DR_GROUP_GAP (next_stmt_info))
6518 gap_count++;
6519 continue;
6522 /* ??? The following needs cleanup after the removal of
6523 DR_GROUP_SAME_DR_STMT. */
6524 if (next_stmt_info)
6526 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6527 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6528 copies, and we put the new vector statement last. */
6529 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6531 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6532 gap_count = 1;
6537 /* Function vect_can_force_dr_alignment_p.
6539 Returns whether the alignment of a DECL can be forced to be aligned
6540 on ALIGNMENT bit boundary. */
6542 bool
6543 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6545 if (!VAR_P (decl))
6546 return false;
6548 if (decl_in_symtab_p (decl)
6549 && !symtab_node::get (decl)->can_increase_alignment_p ())
6550 return false;
6552 if (TREE_STATIC (decl))
6553 return (known_le (alignment,
6554 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6555 else
6556 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6560 /* Return whether the data reference DR_INFO is supported with respect to its
6561 alignment.
6562 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6563 it is aligned, i.e., check if it is possible to vectorize it with different
6564 alignment. */
6566 enum dr_alignment_support
6567 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6568 bool check_aligned_accesses)
6570 data_reference *dr = dr_info->dr;
6571 stmt_vec_info stmt_info = dr_info->stmt;
6572 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6573 machine_mode mode = TYPE_MODE (vectype);
6574 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6575 class loop *vect_loop = NULL;
6576 bool nested_in_vect_loop = false;
6578 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6579 return dr_aligned;
6581 /* For now assume all conditional loads/stores support unaligned
6582 access without any special code. */
6583 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6584 if (gimple_call_internal_p (stmt)
6585 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6586 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6587 return dr_unaligned_supported;
6589 if (loop_vinfo)
6591 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6592 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6595 /* Possibly unaligned access. */
6597 /* We can choose between using the implicit realignment scheme (generating
6598 a misaligned_move stmt) and the explicit realignment scheme (generating
6599 aligned loads with a REALIGN_LOAD). There are two variants to the
6600 explicit realignment scheme: optimized, and unoptimized.
6601 We can optimize the realignment only if the step between consecutive
6602 vector loads is equal to the vector size. Since the vector memory
6603 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6604 is guaranteed that the misalignment amount remains the same throughout the
6605 execution of the vectorized loop. Therefore, we can create the
6606 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6607 at the loop preheader.
6609 However, in the case of outer-loop vectorization, when vectorizing a
6610 memory access in the inner-loop nested within the LOOP that is now being
6611 vectorized, while it is guaranteed that the misalignment of the
6612 vectorized memory access will remain the same in different outer-loop
6613 iterations, it is *not* guaranteed that it will remain the same throughout
6614 the execution of the inner-loop. This is because the inner-loop advances
6615 with the original scalar step (and not in steps of VS). If the inner-loop
6616 step happens to be a multiple of VS, then the misalignment remains fixed
6617 and we can use the optimized realignment scheme. For example:
6619 for (i=0; i<N; i++)
6620 for (j=0; j<M; j++)
6621 s += a[i+j];
6623 When vectorizing the i-loop in the above example, the step between
6624 consecutive vector loads is 1, and so the misalignment does not remain
6625 fixed across the execution of the inner-loop, and the realignment cannot
6626 be optimized (as illustrated in the following pseudo vectorized loop):
6628 for (i=0; i<N; i+=4)
6629 for (j=0; j<M; j++){
6630 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6631 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6632 // (assuming that we start from an aligned address).
6635 We therefore have to use the unoptimized realignment scheme:
6637 for (i=0; i<N; i+=4)
6638 for (j=k; j<M; j+=4)
6639 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6640 // that the misalignment of the initial address is
6641 // 0).
6643 The loop can then be vectorized as follows:
6645 for (k=0; k<4; k++){
6646 rt = get_realignment_token (&vp[k]);
6647 for (i=0; i<N; i+=4){
6648 v1 = vp[i+k];
6649 for (j=k; j<M; j+=4){
6650 v2 = vp[i+j+VS-1];
6651 va = REALIGN_LOAD <v1,v2,rt>;
6652 vs += va;
6653 v1 = v2;
6656 } */
6658 if (DR_IS_READ (dr))
6660 bool is_packed = false;
6661 tree type = (TREE_TYPE (DR_REF (dr)));
6663 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6664 && (!targetm.vectorize.builtin_mask_for_load
6665 || targetm.vectorize.builtin_mask_for_load ()))
6667 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6669 /* If we are doing SLP then the accesses need not have the
6670 same alignment; instead it depends on the SLP group size. */
6671 if (loop_vinfo
6672 && STMT_SLP_TYPE (stmt_info)
6673 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6674 * (DR_GROUP_SIZE
6675 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6676 TYPE_VECTOR_SUBPARTS (vectype)))
6678 else if (!loop_vinfo
6679 || (nested_in_vect_loop
6680 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6681 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6682 return dr_explicit_realign;
6683 else
6684 return dr_explicit_realign_optimized;
6686 if (!known_alignment_for_access_p (dr_info))
6687 is_packed = not_size_aligned (DR_REF (dr));
6689 if (targetm.vectorize.support_vector_misalignment
6690 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6691 /* Can't software pipeline the loads, but can at least do them. */
6692 return dr_unaligned_supported;
6694 else
6696 bool is_packed = false;
6697 tree type = (TREE_TYPE (DR_REF (dr)));
6699 if (!known_alignment_for_access_p (dr_info))
6700 is_packed = not_size_aligned (DR_REF (dr));
6702 if (targetm.vectorize.support_vector_misalignment
6703 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6704 return dr_unaligned_supported;
6707 /* Unsupported. */
6708 return dr_unaligned_unsupported;