gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2020 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56
  57 /* Return true if load- or store-lanes optab OPTAB is implemented for
  58    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  59
  60 static bool
  61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  62                               tree vectype, unsigned HOST_WIDE_INT count)
  63 {
  64   machine_mode mode, array_mode;
  65   bool limit_p;
  66
  67   mode = TYPE_MODE (vectype);
  68   if (!targetm.array_mode (mode, count).exists (&array_mode))
  69     {
  70       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  71       limit_p = !targetm.array_mode_supported_p (mode, count);
  72       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  73         {
  74           if (dump_enabled_p ())
  75             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  76                              "no array mode for %s[%wu]\n",
  77                              GET_MODE_NAME (mode), count);
  78           return false;
  79         }
  80     }
  81
  82   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  83     {
  84       if (dump_enabled_p ())
  85         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  86                          "cannot use %s<%s><%s>\n", name,
  87                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  88       return false;
  89     }
  90
  91   if (dump_enabled_p ())
  92     dump_printf_loc (MSG_NOTE, vect_location,
  93                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  94                      GET_MODE_NAME (mode));
  95
  96   return true;
  97 }
  98
  99
 100 /* Return the smallest scalar part of STMT_INFO.
 101    This is used to determine the vectype of the stmt.  We generally set the
 102    vectype according to the type of the result (lhs).  For stmts whose
 103    result-type is different than the type of the arguments (e.g., demotion,
 104    promotion), vectype will be reset appropriately (later).  Note that we have
 105    to visit the smallest datatype in this function, because that determines the
 106    VF.  If the smallest datatype in the loop is present only as the rhs of a
 107    promotion operation - we'd miss it.
 108    Such a case, where a variable of this datatype does not appear in the lhs
 109    anywhere in the loop, can only occur if it's an invariant: e.g.:
 110    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 111    invariant motion.  However, we cannot rely on invariant motion to always
 112    take invariants out of the loop, and so in the case of promotion we also
 113    have to check the rhs.
 114    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 115    types.  */
 116
 117 tree
 118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
 119                                HOST_WIDE_INT *lhs_size_unit,
 120                                HOST_WIDE_INT *rhs_size_unit)
 121 {
 122   tree scalar_type = gimple_expr_type (stmt_info->stmt);
 123   HOST_WIDE_INT lhs, rhs;
 124
 125   /* During the analysis phase, this function is called on arbitrary
 126      statements that might not have scalar results.  */
 127   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 128     return scalar_type;
 129
 130   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 131
 132   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 133   if (assign
 134       && (gimple_assign_cast_p (assign)
 135           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 148     {
 149       unsigned int i = 0;
 150       if (gimple_call_internal_p (call))
 151         {
 152           internal_fn ifn = gimple_call_internal_fn (call);
 153           if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
 154             /* gimple_expr_type already picked the type of the loaded
 155                or stored data.  */
 156             i = ~0U;
 157           else if (internal_fn_mask_index (ifn) == 0)
 158             i = 1;
 159         }
 160       if (i < gimple_call_num_args (call))
 161         {
 162           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 163           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 164             {
 165               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 166               if (rhs < lhs)
 167                 scalar_type = rhs_type;
 168             }
 169         }
 170     }
 171
 172   *lhs_size_unit = lhs;
 173   *rhs_size_unit = rhs;
 174   return scalar_type;
 175 }
 176
 177
 178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 179    tested at run-time.  Return TRUE if DDR was successfully inserted.
 180    Return false if versioning is not supported.  */
 181
 182 static opt_result
 183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 184 {
 185   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186
 187   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 188     return opt_result::failure_at (vect_location,
 189                                    "will not create alias checks, as"
 190                                    " --param vect-max-version-for-alias-checks"
 191                                    " == 0\n");
 192
 193   opt_result res
 194     = runtime_alias_check_p (ddr, loop,
 195                              optimize_loop_nest_for_speed_p (loop));
 196   if (!res)
 197     return res;
 198
 199   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 200   return opt_result::success ();
 201 }
 202
 203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 204
 205 static void
 206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 207 {
 208   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 209   for (unsigned int i = 0; i < checks.length(); ++i)
 210     if (checks[i] == value)
 211       return;
 212
 213   if (dump_enabled_p ())
 214     dump_printf_loc (MSG_NOTE, vect_location,
 215                      "need run-time check that %T is nonzero\n",
 216                      value);
 217   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 218 }
 219
 220 /* Return true if we know that the order of vectorized DR_INFO_A and
 221    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 222    DR_INFO_B.  At least one of the accesses is a write.  */
 223
 224 static bool
 225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 226 {
 227   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 228   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 229
 230   /* Single statements are always kept in their original order.  */
 231   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 232       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 233     return true;
 234
 235   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 236      SLP group are emitted at the position of the last scalar load and
 237      all loads in an interleaving group are emitted at the position
 238      of the first scalar load.
 239      Stores in a group are emitted at the position of the last scalar store.
 240      Compute that position and check whether the resulting order matches
 241      the current one.
 242      We have not yet decided between SLP and interleaving so we have
 243      to conservatively assume both.  */
 244   stmt_vec_info il_a;
 245   stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 246   if (last_a)
 247     {
 248       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
 249            s = DR_GROUP_NEXT_ELEMENT (s))
 250         last_a = get_later_stmt (last_a, s);
 251       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 252         {
 253           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 254                s = DR_GROUP_NEXT_ELEMENT (s))
 255             if (get_later_stmt (il_a, s) == il_a)
 256               il_a = s;
 257         }
 258       else
 259         il_a = last_a;
 260     }
 261   else
 262     last_a = il_a = stmtinfo_a;
 263   stmt_vec_info il_b;
 264   stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 265   if (last_b)
 266     {
 267       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
 268            s = DR_GROUP_NEXT_ELEMENT (s))
 269         last_b = get_later_stmt (last_b, s);
 270       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 271         {
 272           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 273                s = DR_GROUP_NEXT_ELEMENT (s))
 274             if (get_later_stmt (il_b, s) == il_b)
 275               il_b = s;
 276         }
 277       else
 278         il_b = last_b;
 279     }
 280   else
 281     last_b = il_b = stmtinfo_b;
 282   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 283   return (/* SLP */
 284           (get_later_stmt (last_a, last_b) == last_a) == a_after_b
 285           /* Interleaving */
 286           && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
 287           /* Mixed */
 288           && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
 289           && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
 290 }
 291
 292 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 293    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 294    distances.  These distances are conservatively correct but they don't
 295    reflect a guaranteed dependence.
 296
 297    Return true if this function does all the work necessary to avoid
 298    an alias or false if the caller should use the dependence distances
 299    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 300    the depth of the loop described by LOOP_VINFO and the other arguments
 301    are as for vect_analyze_data_ref_dependence.  */
 302
 303 static bool
 304 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 305                                        loop_vec_info loop_vinfo,
 306                                        int loop_depth, unsigned int *max_vf)
 307 {
 308   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 309   lambda_vector dist_v;
 310   unsigned int i;
 311   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 312     {
 313       int dist = dist_v[loop_depth];
 314       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 315         {
 316           /* If the user asserted safelen >= DIST consecutive iterations
 317              can be executed concurrently, assume independence.
 318
 319              ??? An alternative would be to add the alias check even
 320              in this case, and vectorize the fallback loop with the
 321              maximum VF set to safelen.  However, if the user has
 322              explicitly given a length, it's less likely that that
 323              would be a win.  */
 324           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 325             {
 326               if ((unsigned int) loop->safelen < *max_vf)
 327                 *max_vf = loop->safelen;
 328               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 329               continue;
 330             }
 331
 332           /* For dependence distances of 2 or more, we have the option
 333              of limiting VF or checking for an alias at runtime.
 334              Prefer to check at runtime if we can, to avoid limiting
 335              the VF unnecessarily when the bases are in fact independent.
 336
 337              Note that the alias checks will be removed if the VF ends up
 338              being small enough.  */
 339           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 340           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 341           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 342                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 343                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 344         }
 345     }
 346   return true;
 347 }
 348
 349
 350 /* Function vect_analyze_data_ref_dependence.
 351
 352    FIXME: I needed to change the sense of the returned flag.
 353
 354    Return FALSE if there (might) exist a dependence between a memory-reference
 355    DRA and a memory-reference DRB.  When versioning for alias may check a
 356    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 357    the data dependence.  */
 358
 359 static opt_result
 360 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 361                                   loop_vec_info loop_vinfo,
 362                                   unsigned int *max_vf)
 363 {
 364   unsigned int i;
 365   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 366   struct data_reference *dra = DDR_A (ddr);
 367   struct data_reference *drb = DDR_B (ddr);
 368   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 369   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 370   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 371   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 372   lambda_vector dist_v;
 373   unsigned int loop_depth;
 374
 375   /* In loop analysis all data references should be vectorizable.  */
 376   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 377       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 378     gcc_unreachable ();
 379
 380   /* Independent data accesses.  */
 381   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 382     return opt_result::success ();
 383
 384   if (dra == drb
 385       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 386     return opt_result::success ();
 387
 388   /* We do not have to consider dependences between accesses that belong
 389      to the same group, unless the stride could be smaller than the
 390      group size.  */
 391   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 392       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 393           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 394       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 395     return opt_result::success ();
 396
 397   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 398      least two scalar iterations, there is always also a true dependence.
 399      As the vectorizer does not re-order loads and stores we can ignore
 400      the anti-dependence if TBAA can disambiguate both DRs similar to the
 401      case with known negative distance anti-dependences (positive
 402      distance anti-dependences would violate TBAA constraints).  */
 403   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 404        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 405       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 406                                  get_alias_set (DR_REF (drb))))
 407     return opt_result::success ();
 408
 409   /* Unknown data dependence.  */
 410   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 411     {
 412       /* If user asserted safelen consecutive iterations can be
 413          executed concurrently, assume independence.  */
 414       if (loop->safelen >= 2)
 415         {
 416           if ((unsigned int) loop->safelen < *max_vf)
 417             *max_vf = loop->safelen;
 418           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 419           return opt_result::success ();
 420         }
 421
 422       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 423           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 424         return opt_result::failure_at
 425           (stmtinfo_a->stmt,
 426            "versioning for alias not supported for: "
 427            "can't determine dependence between %T and %T\n",
 428            DR_REF (dra), DR_REF (drb));
 429
 430       if (dump_enabled_p ())
 431         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 432                          "versioning for alias required: "
 433                          "can't determine dependence between %T and %T\n",
 434                          DR_REF (dra), DR_REF (drb));
 435
 436       /* Add to list of ddrs that need to be tested at run-time.  */
 437       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 438     }
 439
 440   /* Known data dependence.  */
 441   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 442     {
 443       /* If user asserted safelen consecutive iterations can be
 444          executed concurrently, assume independence.  */
 445       if (loop->safelen >= 2)
 446         {
 447           if ((unsigned int) loop->safelen < *max_vf)
 448             *max_vf = loop->safelen;
 449           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 450           return opt_result::success ();
 451         }
 452
 453       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 454           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 455         return opt_result::failure_at
 456           (stmtinfo_a->stmt,
 457            "versioning for alias not supported for: "
 458            "bad dist vector for %T and %T\n",
 459            DR_REF (dra), DR_REF (drb));
 460
 461       if (dump_enabled_p ())
 462         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 463                          "versioning for alias required: "
 464                          "bad dist vector for %T and %T\n",
 465                          DR_REF (dra), DR_REF (drb));
 466       /* Add to list of ddrs that need to be tested at run-time.  */
 467       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 468     }
 469
 470   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 471
 472   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 473       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 474                                                 loop_depth, max_vf))
 475     return opt_result::success ();
 476
 477   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 478     {
 479       int dist = dist_v[loop_depth];
 480
 481       if (dump_enabled_p ())
 482         dump_printf_loc (MSG_NOTE, vect_location,
 483                          "dependence distance  = %d.\n", dist);
 484
 485       if (dist == 0)
 486         {
 487           if (dump_enabled_p ())
 488             dump_printf_loc (MSG_NOTE, vect_location,
 489                              "dependence distance == 0 between %T and %T\n",
 490                              DR_REF (dra), DR_REF (drb));
 491
 492           /* When we perform grouped accesses and perform implicit CSE
 493              by detecting equal accesses and doing disambiguation with
 494              runtime alias tests like for
 495                 .. = a[i];
 496                 .. = a[i+1];
 497                 a[i] = ..;
 498                 a[i+1] = ..;
 499                 *p = ..;
 500                 .. = a[i];
 501                 .. = a[i+1];
 502              where we will end up loading { a[i], a[i+1] } once, make
 503              sure that inserting group loads before the first load and
 504              stores after the last store will do the right thing.
 505              Similar for groups like
 506                 a[i] = ...;
 507                 ... = a[i];
 508                 a[i+1] = ...;
 509              where loads from the group interleave with the store.  */
 510           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 511             return opt_result::failure_at (stmtinfo_a->stmt,
 512                                            "READ_WRITE dependence"
 513                                            " in interleaving.\n");
 514
 515           if (loop->safelen < 2)
 516             {
 517               tree indicator = dr_zero_step_indicator (dra);
 518               if (!indicator || integer_zerop (indicator))
 519                 return opt_result::failure_at (stmtinfo_a->stmt,
 520                                                "access also has a zero step\n");
 521               else if (TREE_CODE (indicator) != INTEGER_CST)
 522                 vect_check_nonzero_value (loop_vinfo, indicator);
 523             }
 524           continue;
 525         }
 526
 527       if (dist > 0 && DDR_REVERSED_P (ddr))
 528         {
 529           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 530              reversed (to make distance vector positive), and the actual
 531              distance is negative.  */
 532           if (dump_enabled_p ())
 533             dump_printf_loc (MSG_NOTE, vect_location,
 534                              "dependence distance negative.\n");
 535           /* When doing outer loop vectorization, we need to check if there is
 536              a backward dependence at the inner loop level if the dependence
 537              at the outer loop is reversed.  See PR81740.  */
 538           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 539               || nested_in_vect_loop_p (loop, stmtinfo_b))
 540             {
 541               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 542                                                          DDR_LOOP_NEST (ddr));
 543               if (dist_v[inner_depth] < 0)
 544                 return opt_result::failure_at (stmtinfo_a->stmt,
 545                                                "not vectorized, dependence "
 546                                                "between data-refs %T and %T\n",
 547                                                DR_REF (dra), DR_REF (drb));
 548             }
 549           /* Record a negative dependence distance to later limit the
 550              amount of stmt copying / unrolling we can perform.
 551              Only need to handle read-after-write dependence.  */
 552           if (DR_IS_READ (drb)
 553               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 554                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 555             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 556           continue;
 557         }
 558
 559       unsigned int abs_dist = abs (dist);
 560       if (abs_dist >= 2 && abs_dist < *max_vf)
 561         {
 562           /* The dependence distance requires reduction of the maximal
 563              vectorization factor.  */
 564           *max_vf = abs_dist;
 565           if (dump_enabled_p ())
 566             dump_printf_loc (MSG_NOTE, vect_location,
 567                              "adjusting maximal vectorization factor to %i\n",
 568                              *max_vf);
 569         }
 570
 571       if (abs_dist >= *max_vf)
 572         {
 573           /* Dependence distance does not create dependence, as far as
 574              vectorization is concerned, in this case.  */
 575           if (dump_enabled_p ())
 576             dump_printf_loc (MSG_NOTE, vect_location,
 577                              "dependence distance >= VF.\n");
 578           continue;
 579         }
 580
 581       return opt_result::failure_at (stmtinfo_a->stmt,
 582                                      "not vectorized, possible dependence "
 583                                      "between data-refs %T and %T\n",
 584                                      DR_REF (dra), DR_REF (drb));
 585     }
 586
 587   return opt_result::success ();
 588 }
 589
 590 /* Function vect_analyze_data_ref_dependences.
 591
 592    Examine all the data references in the loop, and make sure there do not
 593    exist any data dependences between them.  Set *MAX_VF according to
 594    the maximum vectorization factor the data dependences allow.  */
 595
 596 opt_result
 597 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 598                                    unsigned int *max_vf)
 599 {
 600   unsigned int i;
 601   struct data_dependence_relation *ddr;
 602
 603   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 604
 605   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 606     {
 607       LOOP_VINFO_DDRS (loop_vinfo)
 608         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 609                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 610       /* We need read-read dependences to compute
 611          STMT_VINFO_SAME_ALIGN_REFS.  */
 612       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 613                                           &LOOP_VINFO_DDRS (loop_vinfo),
 614                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 615                                           true);
 616       gcc_assert (res);
 617     }
 618
 619   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 620
 621   /* For epilogues we either have no aliases or alias versioning
 622      was applied to original loop.  Therefore we may just get max_vf
 623      using VF of original loop.  */
 624   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 625     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 626   else
 627     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 628       {
 629         opt_result res
 630           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 631         if (!res)
 632           return res;
 633       }
 634
 635   return opt_result::success ();
 636 }
 637
 638
 639 /* Function vect_slp_analyze_data_ref_dependence.
 640
 641    Return TRUE if there (might) exist a dependence between a memory-reference
 642    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 643    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 644    according to the data dependence.  */
 645
 646 static bool
 647 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 648                                       struct data_dependence_relation *ddr)
 649 {
 650   struct data_reference *dra = DDR_A (ddr);
 651   struct data_reference *drb = DDR_B (ddr);
 652   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 653   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 654
 655   /* We need to check dependences of statements marked as unvectorizable
 656      as well, they still can prohibit vectorization.  */
 657
 658   /* Independent data accesses.  */
 659   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 660     return false;
 661
 662   if (dra == drb)
 663     return false;
 664
 665   /* Read-read is OK.  */
 666   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 667     return false;
 668
 669   /* If dra and drb are part of the same interleaving chain consider
 670      them independent.  */
 671   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 672       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 673           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 674     return false;
 675
 676   /* Unknown data dependence.  */
 677   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 678     {
 679       if  (dump_enabled_p ())
 680         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 681                          "can't determine dependence between %T and %T\n",
 682                          DR_REF (dra), DR_REF (drb));
 683     }
 684   else if (dump_enabled_p ())
 685     dump_printf_loc (MSG_NOTE, vect_location,
 686                      "determined dependence between %T and %T\n",
 687                      DR_REF (dra), DR_REF (drb));
 688
 689   return true;
 690 }
 691
 692
 693 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 694    contain the vector of scalar stores of this instance if we are
 695    disambiguating the loads.  */
 696
 697 static bool
 698 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 699                                    vec<stmt_vec_info> stores,
 700                                    stmt_vec_info last_store_info)
 701 {
 702   /* This walks over all stmts involved in the SLP load/store done
 703      in NODE verifying we can sink them up to the last stmt in the
 704      group.  */
 705   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 706   vec_info *vinfo = last_access_info->vinfo;
 707   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 708     {
 709       stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
 710       if (access_info == last_access_info)
 711         continue;
 712       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 713       ao_ref ref;
 714       bool ref_initialized_p = false;
 715       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 716            gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 717         {
 718           gimple *stmt = gsi_stmt (gsi);
 719           if (! gimple_vuse (stmt)
 720               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 721             continue;
 722
 723           /* If we couldn't record a (single) data reference for this
 724              stmt we have to resort to the alias oracle.  */
 725           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 726           data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 727           if (!dr_b)
 728             {
 729               /* We are moving a store or sinking a load - this means
 730                  we cannot use TBAA for disambiguation.  */
 731               if (!ref_initialized_p)
 732                 ao_ref_init (&ref, DR_REF (dr_a));
 733               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 734                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 735                 return false;
 736               continue;
 737             }
 738
 739           bool dependent = false;
 740           /* If we run into a store of this same instance (we've just
 741              marked those) then delay dependence checking until we run
 742              into the last store because this is where it will have
 743              been sunk to (and we verify if we can do that as well).  */
 744           if (gimple_visited_p (stmt))
 745             {
 746               if (stmt_info != last_store_info)
 747                 continue;
 748               unsigned i;
 749               stmt_vec_info store_info;
 750               FOR_EACH_VEC_ELT (stores, i, store_info)
 751                 {
 752                   data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
 753                   ddr_p ddr = initialize_data_dependence_relation
 754                                 (dr_a, store_dr, vNULL);
 755                   dependent
 756                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 757                   free_dependence_relation (ddr);
 758                   if (dependent)
 759                     break;
 760                 }
 761             }
 762           else
 763             {
 764               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 765                                                                dr_b, vNULL);
 766               dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 767               free_dependence_relation (ddr);
 768             }
 769           if (dependent)
 770             return false;
 771         }
 772     }
 773   return true;
 774 }
 775
 776
 777 /* Function vect_analyze_data_ref_dependences.
 778
 779    Examine all the data references in the basic-block, and make sure there
 780    do not exist any data dependences between them.  Set *MAX_VF according to
 781    the maximum vectorization factor the data dependences allow.  */
 782
 783 bool
 784 vect_slp_analyze_instance_dependence (slp_instance instance)
 785 {
 786   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 787
 788   /* The stores of this instance are at the root of the SLP tree.  */
 789   slp_tree store = SLP_INSTANCE_TREE (instance);
 790   if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
 791     store = NULL;
 792
 793   /* Verify we can sink stores to the vectorized stmt insert location.  */
 794   stmt_vec_info last_store_info = NULL;
 795   if (store)
 796     {
 797       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 798         return false;
 799
 800       /* Mark stores in this instance and remember the last one.  */
 801       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 802       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 803         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 804     }
 805
 806   bool res = true;
 807
 808   /* Verify we can sink loads to the vectorized stmt insert location,
 809      special-casing stores of this instance.  */
 810   slp_tree load;
 811   unsigned int i;
 812   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 813     if (! vect_slp_analyze_node_dependences (instance, load,
 814                                              store
 815                                              ? SLP_TREE_SCALAR_STMTS (store)
 816                                              : vNULL, last_store_info))
 817       {
 818         res = false;
 819         break;
 820       }
 821
 822   /* Unset the visited flag.  */
 823   if (store)
 824     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 825       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 826
 827   return res;
 828 }
 829
 830 /* Record the base alignment guarantee given by DRB, which occurs
 831    in STMT_INFO.  */
 832
 833 static void
 834 vect_record_base_alignment (stmt_vec_info stmt_info,
 835                             innermost_loop_behavior *drb)
 836 {
 837   vec_info *vinfo = stmt_info->vinfo;
 838   bool existed;
 839   innermost_loop_behavior *&entry
 840     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 841   if (!existed || entry->base_alignment < drb->base_alignment)
 842     {
 843       entry = drb;
 844       if (dump_enabled_p ())
 845         dump_printf_loc (MSG_NOTE, vect_location,
 846                          "recording new base alignment for %T\n"
 847                          "  alignment:    %d\n"
 848                          "  misalignment: %d\n"
 849                          "  based on:     %G",
 850                          drb->base_address,
 851                          drb->base_alignment,
 852                          drb->base_misalignment,
 853                          stmt_info->stmt);
 854     }
 855 }
 856
 857 /* If the region we're going to vectorize is reached, all unconditional
 858    data references occur at least once.  We can therefore pool the base
 859    alignment guarantees from each unconditional reference.  Do this by
 860    going through all the data references in VINFO and checking whether
 861    the containing statement makes the reference unconditionally.  If so,
 862    record the alignment of the base address in VINFO so that it can be
 863    used for all other references with the same base.  */
 864
 865 void
 866 vect_record_base_alignments (vec_info *vinfo)
 867 {
 868   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 869   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 870   data_reference *dr;
 871   unsigned int i;
 872   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
 873     {
 874       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 875       stmt_vec_info stmt_info = dr_info->stmt;
 876       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 877           && STMT_VINFO_VECTORIZABLE (stmt_info)
 878           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 879         {
 880           vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
 881
 882           /* If DR is nested in the loop that is being vectorized, we can also
 883              record the alignment of the base wrt the outer loop.  */
 884           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 885             vect_record_base_alignment
 886               (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 887         }
 888     }
 889 }
 890
 891 /* Return the target alignment for the vectorized form of DR_INFO.  */
 892
 893 static poly_uint64
 894 vect_calculate_target_alignment (dr_vec_info *dr_info)
 895 {
 896   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
 897   return targetm.vectorize.preferred_vector_alignment (vectype);
 898 }
 899
 900 /* Function vect_compute_data_ref_alignment
 901
 902    Compute the misalignment of the data reference DR_INFO.
 903
 904    Output:
 905    1. DR_MISALIGNMENT (DR_INFO) is defined.
 906
 907    FOR NOW: No analysis is actually performed. Misalignment is calculated
 908    only for trivial cases. TODO.  */
 909
 910 static void
 911 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
 912 {
 913   stmt_vec_info stmt_info = dr_info->stmt;
 914   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 915   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 916   class loop *loop = NULL;
 917   tree ref = DR_REF (dr_info->dr);
 918   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 919
 920   if (dump_enabled_p ())
 921     dump_printf_loc (MSG_NOTE, vect_location,
 922                      "vect_compute_data_ref_alignment:\n");
 923
 924   if (loop_vinfo)
 925     loop = LOOP_VINFO_LOOP (loop_vinfo);
 926
 927   /* Initialize misalignment to unknown.  */
 928   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
 929
 930   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 931     return;
 932
 933   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
 934   bool step_preserves_misalignment_p;
 935
 936   poly_uint64 vector_alignment
 937     = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
 938   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
 939
 940   /* If the main loop has peeled for alignment we have no way of knowing
 941      whether the data accesses in the epilogues are aligned.  We can't at
 942      compile time answer the question whether we have entered the main loop or
 943      not.  Fixes PR 92351.  */
 944   if (loop_vinfo)
 945     {
 946       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
 947       if (orig_loop_vinfo
 948           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
 949         return;
 950     }
 951
 952   unsigned HOST_WIDE_INT vect_align_c;
 953   if (!vector_alignment.is_constant (&vect_align_c))
 954     return;
 955
 956   /* No step for BB vectorization.  */
 957   if (!loop)
 958     {
 959       gcc_assert (integer_zerop (drb->step));
 960       step_preserves_misalignment_p = true;
 961     }
 962
 963   /* In case the dataref is in an inner-loop of the loop that is being
 964      vectorized (LOOP), we use the base and misalignment information
 965      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 966      stays the same throughout the execution of the inner-loop, which is why
 967      we have to check that the stride of the dataref in the inner-loop evenly
 968      divides by the vector alignment.  */
 969   else if (nested_in_vect_loop_p (loop, stmt_info))
 970     {
 971       step_preserves_misalignment_p
 972         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
 973
 974       if (dump_enabled_p ())
 975         {
 976           if (step_preserves_misalignment_p)
 977             dump_printf_loc (MSG_NOTE, vect_location,
 978                              "inner step divides the vector alignment.\n");
 979           else
 980             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 981                              "inner step doesn't divide the vector"
 982                              " alignment.\n");
 983         }
 984     }
 985
 986   /* Similarly we can only use base and misalignment information relative to
 987      an innermost loop if the misalignment stays the same throughout the
 988      execution of the loop.  As above, this is the case if the stride of
 989      the dataref evenly divides by the alignment.  */
 990   else
 991     {
 992       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 993       step_preserves_misalignment_p
 994         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
 995
 996       if (!step_preserves_misalignment_p && dump_enabled_p ())
 997         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 998                          "step doesn't divide the vector alignment.\n");
 999     }
1000
1001   unsigned int base_alignment = drb->base_alignment;
1002   unsigned int base_misalignment = drb->base_misalignment;
1003
1004   /* Calculate the maximum of the pooled base address alignment and the
1005      alignment that we can compute for DR itself.  */
1006   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1007   if (entry && base_alignment < (*entry)->base_alignment)
1008     {
1009       base_alignment = (*entry)->base_alignment;
1010       base_misalignment = (*entry)->base_misalignment;
1011     }
1012
1013   if (drb->offset_alignment < vect_align_c
1014       || !step_preserves_misalignment_p
1015       /* We need to know whether the step wrt the vectorized loop is
1016          negative when computing the starting misalignment below.  */
1017       || TREE_CODE (drb->step) != INTEGER_CST)
1018     {
1019       if (dump_enabled_p ())
1020         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021                          "Unknown alignment for access: %T\n", ref);
1022       return;
1023     }
1024
1025   if (base_alignment < vect_align_c)
1026     {
1027       unsigned int max_alignment;
1028       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1029       if (max_alignment < vect_align_c
1030           || !vect_can_force_dr_alignment_p (base,
1031                                              vect_align_c * BITS_PER_UNIT))
1032         {
1033           if (dump_enabled_p ())
1034             dump_printf_loc (MSG_NOTE, vect_location,
1035                              "can't force alignment of ref: %T\n", ref);
1036           return;
1037         }
1038
1039       /* Force the alignment of the decl.
1040          NOTE: This is the only change to the code we make during
1041          the analysis phase, before deciding to vectorize the loop.  */
1042       if (dump_enabled_p ())
1043         dump_printf_loc (MSG_NOTE, vect_location,
1044                          "force alignment of %T\n", ref);
1045
1046       dr_info->base_decl = base;
1047       dr_info->base_misaligned = true;
1048       base_misalignment = 0;
1049     }
1050   poly_int64 misalignment
1051     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1052
1053   /* If this is a backward running DR then first access in the larger
1054      vectype actually is N-1 elements before the address in the DR.
1055      Adjust misalign accordingly.  */
1056   if (tree_int_cst_sgn (drb->step) < 0)
1057     /* PLUS because STEP is negative.  */
1058     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1059                      * TREE_INT_CST_LOW (drb->step));
1060
1061   unsigned int const_misalignment;
1062   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1063     {
1064       if (dump_enabled_p ())
1065         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066                          "Non-constant misalignment for access: %T\n", ref);
1067       return;
1068     }
1069
1070   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1071
1072   if (dump_enabled_p ())
1073     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074                      "misalign = %d bytes of ref %T\n",
1075                      DR_MISALIGNMENT (dr_info), ref);
1076
1077   return;
1078 }
1079
1080 /* Function vect_update_misalignment_for_peel.
1081    Sets DR_INFO's misalignment
1082    - to 0 if it has the same alignment as DR_PEEL_INFO,
1083    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1084    - to -1 (unknown) otherwise.
1085
1086    DR_INFO - the data reference whose misalignment is to be adjusted.
1087    DR_PEEL_INFO - the data reference whose misalignment is being made
1088                   zero in the vector loop by the peel.
1089    NPEEL - the number of iterations in the peel loop if the misalignment
1090            of DR_PEEL_INFO is known at compile time.  */
1091
1092 static void
1093 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1094                                    dr_vec_info *dr_peel_info, int npeel)
1095 {
1096   unsigned int i;
1097   vec<dr_p> same_aligned_drs;
1098   struct data_reference *current_dr;
1099   stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1100
1101   /* It can be assumed that if dr_info has the same alignment as dr_peel,
1102      it is aligned in the vector loop.  */
1103   same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1104   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1105     {
1106       if (current_dr != dr_info->dr)
1107         continue;
1108       gcc_assert (!known_alignment_for_access_p (dr_info)
1109                   || !known_alignment_for_access_p (dr_peel_info)
1110                   || (DR_MISALIGNMENT (dr_info)
1111                       == DR_MISALIGNMENT (dr_peel_info)));
1112       SET_DR_MISALIGNMENT (dr_info, 0);
1113       return;
1114     }
1115
1116   unsigned HOST_WIDE_INT alignment;
1117   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1118       && known_alignment_for_access_p (dr_info)
1119       && known_alignment_for_access_p (dr_peel_info))
1120     {
1121       int misal = DR_MISALIGNMENT (dr_info);
1122       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1123       misal &= alignment - 1;
1124       SET_DR_MISALIGNMENT (dr_info, misal);
1125       return;
1126     }
1127
1128   if (dump_enabled_p ())
1129     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1130                      "to unknown (-1).\n");
1131   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1132 }
1133
1134
1135 /* Function verify_data_ref_alignment
1136
1137    Return TRUE if DR_INFO can be handled with respect to alignment.  */
1138
1139 static opt_result
1140 verify_data_ref_alignment (dr_vec_info *dr_info)
1141 {
1142   enum dr_alignment_support supportable_dr_alignment
1143     = vect_supportable_dr_alignment (dr_info, false);
1144   if (!supportable_dr_alignment)
1145     return opt_result::failure_at
1146       (dr_info->stmt->stmt,
1147        DR_IS_READ (dr_info->dr)
1148         ? "not vectorized: unsupported unaligned load: %T\n"
1149         : "not vectorized: unsupported unaligned store: %T\n",
1150        DR_REF (dr_info->dr));
1151
1152   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1153     dump_printf_loc (MSG_NOTE, vect_location,
1154                      "Vectorizing an unaligned access.\n");
1155
1156   return opt_result::success ();
1157 }
1158
1159 /* Function vect_verify_datarefs_alignment
1160
1161    Return TRUE if all data references in the loop can be
1162    handled with respect to alignment.  */
1163
1164 opt_result
1165 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1166 {
1167   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1168   struct data_reference *dr;
1169   unsigned int i;
1170
1171   FOR_EACH_VEC_ELT (datarefs, i, dr)
1172     {
1173       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1174       stmt_vec_info stmt_info = dr_info->stmt;
1175
1176       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1177         continue;
1178
1179       /* For interleaving, only the alignment of the first access matters.   */
1180       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1181           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1182         continue;
1183
1184       /* Strided accesses perform only component accesses, alignment is
1185          irrelevant for them.  */
1186       if (STMT_VINFO_STRIDED_P (stmt_info)
1187           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1188         continue;
1189
1190       opt_result res = verify_data_ref_alignment (dr_info);
1191       if (!res)
1192         return res;
1193     }
1194
1195   return opt_result::success ();
1196 }
1197
1198 /* Given an memory reference EXP return whether its alignment is less
1199    than its size.  */
1200
1201 static bool
1202 not_size_aligned (tree exp)
1203 {
1204   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205     return true;
1206
1207   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208           > get_object_alignment (exp));
1209 }
1210
1211 /* Function vector_alignment_reachable_p
1212
1213    Return true if vector alignment for DR_INFO is reachable by peeling
1214    a few loop iterations.  Return false otherwise.  */
1215
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1218 {
1219   stmt_vec_info stmt_info = dr_info->stmt;
1220   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1221
1222   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1223     {
1224       /* For interleaved access we peel only if number of iterations in
1225          the prolog loop ({VF - misalignment}), is a multiple of the
1226          number of the interleaved accesses.  */
1227       int elem_size, mis_in_elements;
1228
1229       /* FORNOW: handle only known alignment.  */
1230       if (!known_alignment_for_access_p (dr_info))
1231         return false;
1232
1233       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235       elem_size = vector_element_size (vector_size, nelements);
1236       mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1237
1238       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239         return false;
1240     }
1241
1242   /* If misalignment is known at the compile time then allow peeling
1243      only if natural alignment is reachable through peeling.  */
1244   if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1245     {
1246       HOST_WIDE_INT elmsize =
1247                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248       if (dump_enabled_p ())
1249         {
1250           dump_printf_loc (MSG_NOTE, vect_location,
1251                            "data size = %wd. misalignment = %d.\n", elmsize,
1252                            DR_MISALIGNMENT (dr_info));
1253         }
1254       if (DR_MISALIGNMENT (dr_info) % elmsize)
1255         {
1256           if (dump_enabled_p ())
1257             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258                              "data size does not divide the misalignment.\n");
1259           return false;
1260         }
1261     }
1262
1263   if (!known_alignment_for_access_p (dr_info))
1264     {
1265       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267       if (dump_enabled_p ())
1268         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269                          "Unknown misalignment, %snaturally aligned\n",
1270                          is_packed ? "not " : "");
1271       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272     }
1273
1274   return true;
1275 }
1276
1277
1278 /* Calculate the cost of the memory access represented by DR_INFO.  */
1279
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282                            unsigned int *inside_cost,
1283                            unsigned int *outside_cost,
1284                            stmt_vector_for_cost *body_cost_vec,
1285                            stmt_vector_for_cost *prologue_cost_vec)
1286 {
1287   stmt_vec_info stmt_info = dr_info->stmt;
1288   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289   int ncopies;
1290
1291   if (PURE_SLP_STMT (stmt_info))
1292     ncopies = 1;
1293   else
1294     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295
1296   if (DR_IS_READ (dr_info->dr))
1297     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298                         prologue_cost_vec, body_cost_vec, false);
1299   else
1300     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301
1302   if (dump_enabled_p ())
1303     dump_printf_loc (MSG_NOTE, vect_location,
1304                      "vect_get_data_access_cost: inside_cost = %d, "
1305                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1306 }
1307
1308
1309 typedef struct _vect_peel_info
1310 {
1311   dr_vec_info *dr_info;
1312   int npeel;
1313   unsigned int count;
1314 } *vect_peel_info;
1315
1316 typedef struct _vect_peel_extended_info
1317 {
1318   struct _vect_peel_info peel_info;
1319   unsigned int inside_cost;
1320   unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1322
1323
1324 /* Peeling hashtable helpers.  */
1325
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 {
1328   static inline hashval_t hash (const _vect_peel_info *);
1329   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1330 };
1331
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 {
1335   return (hashval_t) peel_info->npeel;
1336 }
1337
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 {
1341   return (a->npeel == b->npeel);
1342 }
1343
1344
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1346
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350                           int npeel)
1351 {
1352   struct _vect_peel_info elem, *slot;
1353   _vect_peel_info **new_slot;
1354   bool supportable_dr_alignment
1355     = vect_supportable_dr_alignment (dr_info, true);
1356
1357   elem.npeel = npeel;
1358   slot = peeling_htab->find (&elem);
1359   if (slot)
1360     slot->count++;
1361   else
1362     {
1363       slot = XNEW (struct _vect_peel_info);
1364       slot->npeel = npeel;
1365       slot->dr_info = dr_info;
1366       slot->count = 1;
1367       new_slot = peeling_htab->find_slot (slot, INSERT);
1368       *new_slot = slot;
1369     }
1370
1371   if (!supportable_dr_alignment
1372       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373     slot->count += VECT_MAX_COST;
1374 }
1375
1376
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378    number of data accesses.  */
1379
1380 int
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382                                      _vect_peel_extended_info *max)
1383 {
1384   vect_peel_info elem = *slot;
1385
1386   if (elem->count > max->peel_info.count
1387       || (elem->count == max->peel_info.count
1388           && max->peel_info.npeel > elem->npeel))
1389     {
1390       max->peel_info.npeel = elem->npeel;
1391       max->peel_info.count = elem->count;
1392       max->peel_info.dr_info = elem->dr_info;
1393     }
1394
1395   return 1;
1396 }
1397
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1400    we assume DR0_INFO's misalignment will be zero after peeling.  */
1401
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404                                 dr_vec_info *dr0_info,
1405                                 unsigned int *inside_cost,
1406                                 unsigned int *outside_cost,
1407                                 stmt_vector_for_cost *body_cost_vec,
1408                                 stmt_vector_for_cost *prologue_cost_vec,
1409                                 unsigned int npeel,
1410                                 bool unknown_misalignment)
1411 {
1412   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413   unsigned i;
1414   data_reference *dr;
1415
1416   FOR_EACH_VEC_ELT (datarefs, i, dr)
1417     {
1418       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419       stmt_vec_info stmt_info = dr_info->stmt;
1420       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421         continue;
1422
1423       /* For interleaving, only the alignment of the first access
1424          matters.  */
1425       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427         continue;
1428
1429       /* Strided accesses perform only component accesses, alignment is
1430          irrelevant for them.  */
1431       if (STMT_VINFO_STRIDED_P (stmt_info)
1432           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433         continue;
1434
1435       int save_misalignment;
1436       save_misalignment = DR_MISALIGNMENT (dr_info);
1437       if (npeel == 0)
1438         ;
1439       else if (unknown_misalignment && dr_info == dr0_info)
1440         SET_DR_MISALIGNMENT (dr_info, 0);
1441       else
1442         vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443       vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444                                  body_cost_vec, prologue_cost_vec);
1445       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1446     }
1447 }
1448
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450    Find the one with the lowest cost.  */
1451
1452 int
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454                                    _vect_peel_extended_info *min)
1455 {
1456   vect_peel_info elem = *slot;
1457   int dummy;
1458   unsigned int inside_cost = 0, outside_cost = 0;
1459   stmt_vec_info stmt_info = elem->dr_info->stmt;
1460   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462                        epilogue_cost_vec;
1463
1464   prologue_cost_vec.create (2);
1465   body_cost_vec.create (2);
1466   epilogue_cost_vec.create (2);
1467
1468   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469                                   &outside_cost, &body_cost_vec,
1470                                   &prologue_cost_vec, elem->npeel, false);
1471
1472   body_cost_vec.release ();
1473
1474   outside_cost += vect_get_known_peeling_cost
1475     (loop_vinfo, elem->npeel, &dummy,
1476      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477      &prologue_cost_vec, &epilogue_cost_vec);
1478
1479   /* Prologue and epilogue costs are added to the target model later.
1480      These costs depend only on the scalar iteration cost, the
1481      number of peeling iterations finally chosen, and the number of
1482      misaligned statements.  So discard the information found here.  */
1483   prologue_cost_vec.release ();
1484   epilogue_cost_vec.release ();
1485
1486   if (inside_cost < min->inside_cost
1487       || (inside_cost == min->inside_cost
1488           && outside_cost < min->outside_cost))
1489     {
1490       min->inside_cost = inside_cost;
1491       min->outside_cost = outside_cost;
1492       min->peel_info.dr_info = elem->dr_info;
1493       min->peel_info.npeel = elem->npeel;
1494       min->peel_info.count = elem->count;
1495     }
1496
1497   return 1;
1498 }
1499
1500
1501 /* Choose best peeling option by traversing peeling hash table and either
1502    choosing an option with the lowest cost (if cost model is enabled) or the
1503    option that aligns as many accesses as possible.  */
1504
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507                                        loop_vec_info loop_vinfo)
1508 {
1509    struct _vect_peel_extended_info res;
1510
1511    res.peel_info.dr_info = NULL;
1512
1513    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514      {
1515        res.inside_cost = INT_MAX;
1516        res.outside_cost = INT_MAX;
1517        peeling_htab->traverse <_vect_peel_extended_info *,
1518                                vect_peeling_hash_get_lowest_cost> (&res);
1519      }
1520    else
1521      {
1522        res.peel_info.count = 0;
1523        peeling_htab->traverse <_vect_peel_extended_info *,
1524                                vect_peeling_hash_get_most_frequent> (&res);
1525        res.inside_cost = 0;
1526        res.outside_cost = 0;
1527      }
1528
1529    return res;
1530 }
1531
1532 /* Return true if the new peeling NPEEL is supported.  */
1533
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536                           unsigned npeel)
1537 {
1538   unsigned i;
1539   struct data_reference *dr = NULL;
1540   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541   enum dr_alignment_support supportable_dr_alignment;
1542
1543   /* Ensure that all data refs can be vectorized after the peel.  */
1544   FOR_EACH_VEC_ELT (datarefs, i, dr)
1545     {
1546       int save_misalignment;
1547
1548       if (dr == dr0_info->dr)
1549         continue;
1550
1551       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552       stmt_vec_info stmt_info = dr_info->stmt;
1553       /* For interleaving, only the alignment of the first access
1554          matters.  */
1555       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557         continue;
1558
1559       /* Strided accesses perform only component accesses, alignment is
1560          irrelevant for them.  */
1561       if (STMT_VINFO_STRIDED_P (stmt_info)
1562           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563         continue;
1564
1565       save_misalignment = DR_MISALIGNMENT (dr_info);
1566       vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567       supportable_dr_alignment
1568         = vect_supportable_dr_alignment (dr_info, false);
1569       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1570
1571       if (!supportable_dr_alignment)
1572         return false;
1573     }
1574
1575   return true;
1576 }
1577
1578 /* Function vect_enhance_data_refs_alignment
1579
1580    This pass will use loop versioning and loop peeling in order to enhance
1581    the alignment of data references in the loop.
1582
1583    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584    original loop is to be vectorized.  Any other loops that are created by
1585    the transformations performed in this pass - are not supposed to be
1586    vectorized.  This restriction will be relaxed.
1587
1588    This pass will require a cost model to guide it whether to apply peeling
1589    or versioning or a combination of the two.  For example, the scheme that
1590    intel uses when given a loop with several memory accesses, is as follows:
1591    choose one memory access ('p') which alignment you want to force by doing
1592    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1593    other accesses are not necessarily aligned, or (2) use loop versioning to
1594    generate one loop in which all accesses are aligned, and another loop in
1595    which only 'p' is necessarily aligned.
1596
1597    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1599    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1600
1601    Devising a cost model is the most critical aspect of this work.  It will
1602    guide us on which access to peel for, whether to use loop versioning, how
1603    many versions to create, etc.  The cost model will probably consist of
1604    generic considerations as well as target specific considerations (on
1605    powerpc for example, misaligned stores are more painful than misaligned
1606    loads).
1607
1608    Here are the general steps involved in alignment enhancements:
1609
1610      -- original loop, before alignment analysis:
1611         for (i=0; i<N; i++){
1612           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1613           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1614         }
1615
1616      -- After vect_compute_data_refs_alignment:
1617         for (i=0; i<N; i++){
1618           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1619           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1620         }
1621
1622      -- Possibility 1: we do loop versioning:
1623      if (p is aligned) {
1624         for (i=0; i<N; i++){    # loop 1A
1625           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1626           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1627         }
1628      }
1629      else {
1630         for (i=0; i<N; i++){    # loop 1B
1631           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1632           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1633         }
1634      }
1635
1636      -- Possibility 2: we do loop peeling:
1637      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1638         x = q[i];
1639         p[i] = y;
1640      }
1641      for (i = 3; i < N; i++){   # loop 2A
1642         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1643         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1644      }
1645
1646      -- Possibility 3: combination of loop peeling and versioning:
1647      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1648         x = q[i];
1649         p[i] = y;
1650      }
1651      if (p is aligned) {
1652         for (i = 3; i<N; i++){  # loop 3A
1653           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1654           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1655         }
1656      }
1657      else {
1658         for (i = 3; i<N; i++){  # loop 3B
1659           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1660           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1661         }
1662      }
1663
1664      These loops are later passed to loop_transform to be vectorized.  The
1665      vectorizer will use the alignment information to guide the transformation
1666      (whether to generate regular loads/stores, or with special handling for
1667      misalignment).  */
1668
1669 opt_result
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671 {
1672   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674   enum dr_alignment_support supportable_dr_alignment;
1675   dr_vec_info *first_store = NULL;
1676   dr_vec_info *dr0_info = NULL;
1677   struct data_reference *dr;
1678   unsigned int i, j;
1679   bool do_peeling = false;
1680   bool do_versioning = false;
1681   unsigned int npeel = 0;
1682   bool one_misalignment_known = false;
1683   bool one_misalignment_unknown = false;
1684   bool one_dr_unsupportable = false;
1685   dr_vec_info *unsupportable_dr_info = NULL;
1686   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1687   unsigned possible_npeel_number = 1;
1688   tree vectype;
1689   unsigned int mis, same_align_drs_max = 0;
1690   hash_table<peel_info_hasher> peeling_htab (1);
1691
1692   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1693
1694   /* Reset data so we can safely be called multiple times.  */
1695   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1696   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1697
1698   /* While cost model enhancements are expected in the future, the high level
1699      view of the code at this time is as follows:
1700
1701      A) If there is a misaligned access then see if peeling to align
1702         this access can make all data references satisfy
1703         vect_supportable_dr_alignment.  If so, update data structures
1704         as needed and return true.
1705
1706      B) If peeling wasn't possible and there is a data reference with an
1707         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1708         then see if loop versioning checks can be used to make all data
1709         references satisfy vect_supportable_dr_alignment.  If so, update
1710         data structures as needed and return true.
1711
1712      C) If neither peeling nor versioning were successful then return false if
1713         any data reference does not satisfy vect_supportable_dr_alignment.
1714
1715      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1716
1717      Note, Possibility 3 above (which is peeling and versioning together) is not
1718      being done at this time.  */
1719
1720   /* (1) Peeling to force alignment.  */
1721
1722   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1723      Considerations:
1724      + How many accesses will become aligned due to the peeling
1725      - How many accesses will become unaligned due to the peeling,
1726        and the cost of misaligned accesses.
1727      - The cost of peeling (the extra runtime checks, the increase
1728        in code size).  */
1729
1730   FOR_EACH_VEC_ELT (datarefs, i, dr)
1731     {
1732       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1733       stmt_vec_info stmt_info = dr_info->stmt;
1734
1735       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1736         continue;
1737
1738       /* For interleaving, only the alignment of the first access
1739          matters.  */
1740       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1741           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1742         continue;
1743
1744       /* For scatter-gather or invariant accesses there is nothing
1745          to enhance.  */
1746       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1747           || integer_zerop (DR_STEP (dr)))
1748         continue;
1749
1750       /* Strided accesses perform only component accesses, alignment is
1751          irrelevant for them.  */
1752       if (STMT_VINFO_STRIDED_P (stmt_info)
1753           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1754         continue;
1755
1756       supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1757       do_peeling = vector_alignment_reachable_p (dr_info);
1758       if (do_peeling)
1759         {
1760           if (known_alignment_for_access_p (dr_info))
1761             {
1762               unsigned int npeel_tmp = 0;
1763               bool negative = tree_int_cst_compare (DR_STEP (dr),
1764                                                     size_zero_node) < 0;
1765
1766               vectype = STMT_VINFO_VECTYPE (stmt_info);
1767               /* If known_alignment_for_access_p then we have set
1768                  DR_MISALIGNMENT which is only done if we know it at compiler
1769                  time, so it is safe to assume target alignment is constant.
1770                */
1771               unsigned int target_align =
1772                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773               unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774               mis = (negative
1775                      ? DR_MISALIGNMENT (dr_info)
1776                      : -DR_MISALIGNMENT (dr_info));
1777               if (DR_MISALIGNMENT (dr_info) != 0)
1778                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1779
1780               /* For multiple types, it is possible that the bigger type access
1781                  will have more than one peeling option.  E.g., a loop with two
1782                  types: one of size (vector size / 4), and the other one of
1783                  size (vector size / 8).  Vectorization factor will 8.  If both
1784                  accesses are misaligned by 3, the first one needs one scalar
1785                  iteration to be aligned, and the second one needs 5.  But the
1786                  first one will be aligned also by peeling 5 scalar
1787                  iterations, and in that case both accesses will be aligned.
1788                  Hence, except for the immediate peeling amount, we also want
1789                  to try to add full vector size, while we don't exceed
1790                  vectorization factor.
1791                  We do this automatically for cost model, since we calculate
1792                  cost for every peeling option.  */
1793               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1794                 {
1795                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797                   possible_npeel_number
1798                     = vect_get_num_vectors (nscalars, vectype);
1799
1800                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1801                      allow peeling NELEMENTS.  */
1802                   if (DR_MISALIGNMENT (dr_info) == 0)
1803                     possible_npeel_number++;
1804                 }
1805
1806               /* Save info about DR in the hash table.  Also include peeling
1807                  amounts according to the explanation above.  */
1808               for (j = 0; j < possible_npeel_number; j++)
1809                 {
1810                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811                                             dr_info, npeel_tmp);
1812                   npeel_tmp += target_align / dr_size;
1813                 }
1814
1815               one_misalignment_known = true;
1816             }
1817           else
1818             {
1819               /* If we don't know any misalignment values, we prefer
1820                  peeling for data-ref that has the maximum number of data-refs
1821                  with the same alignment, unless the target prefers to align
1822                  stores over load.  */
1823               unsigned same_align_drs
1824                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825               if (!dr0_info
1826                   || same_align_drs_max < same_align_drs)
1827                 {
1828                   same_align_drs_max = same_align_drs;
1829                   dr0_info = dr_info;
1830                 }
1831               /* For data-refs with the same number of related
1832                  accesses prefer the one where the misalign
1833                  computation will be invariant in the outermost loop.  */
1834               else if (same_align_drs_max == same_align_drs)
1835                 {
1836                   class loop *ivloop0, *ivloop;
1837                   ivloop0 = outermost_invariant_loop_for_expr
1838                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839                   ivloop = outermost_invariant_loop_for_expr
1840                     (loop, DR_BASE_ADDRESS (dr));
1841                   if ((ivloop && !ivloop0)
1842                       || (ivloop && ivloop0
1843                           && flow_loop_nested_p (ivloop, ivloop0)))
1844                     dr0_info = dr_info;
1845                 }
1846
1847               one_misalignment_unknown = true;
1848
1849               /* Check for data refs with unsupportable alignment that
1850                  can be peeled.  */
1851               if (!supportable_dr_alignment)
1852               {
1853                 one_dr_unsupportable = true;
1854                 unsupportable_dr_info = dr_info;
1855               }
1856
1857               if (!first_store && DR_IS_WRITE (dr))
1858                 first_store = dr_info;
1859             }
1860         }
1861       else
1862         {
1863           if (!aligned_access_p (dr_info))
1864             {
1865               if (dump_enabled_p ())
1866                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867                                  "vector alignment may not be reachable\n");
1868               break;
1869             }
1870         }
1871     }
1872
1873   /* Check if we can possibly peel the loop.  */
1874   if (!vect_can_advance_ivs_p (loop_vinfo)
1875       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876       || loop->inner)
1877     do_peeling = false;
1878
1879   struct _vect_peel_extended_info peel_for_known_alignment;
1880   struct _vect_peel_extended_info peel_for_unknown_alignment;
1881   struct _vect_peel_extended_info best_peel;
1882
1883   peel_for_unknown_alignment.inside_cost = INT_MAX;
1884   peel_for_unknown_alignment.outside_cost = INT_MAX;
1885   peel_for_unknown_alignment.peel_info.count = 0;
1886
1887   if (do_peeling
1888       && one_misalignment_unknown)
1889     {
1890       /* Check if the target requires to prefer stores over loads, i.e., if
1891          misaligned stores are more expensive than misaligned loads (taking
1892          drs with same alignment into account).  */
1893       unsigned int load_inside_cost = 0;
1894       unsigned int load_outside_cost = 0;
1895       unsigned int store_inside_cost = 0;
1896       unsigned int store_outside_cost = 0;
1897       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1898
1899       stmt_vector_for_cost dummy;
1900       dummy.create (2);
1901       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902                                       &load_inside_cost,
1903                                       &load_outside_cost,
1904                                       &dummy, &dummy, estimated_npeels, true);
1905       dummy.release ();
1906
1907       if (first_store)
1908         {
1909           dummy.create (2);
1910           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911                                           &store_inside_cost,
1912                                           &store_outside_cost,
1913                                           &dummy, &dummy,
1914                                           estimated_npeels, true);
1915           dummy.release ();
1916         }
1917       else
1918         {
1919           store_inside_cost = INT_MAX;
1920           store_outside_cost = INT_MAX;
1921         }
1922
1923       if (load_inside_cost > store_inside_cost
1924           || (load_inside_cost == store_inside_cost
1925               && load_outside_cost > store_outside_cost))
1926         {
1927           dr0_info = first_store;
1928           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1930         }
1931       else
1932         {
1933           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935         }
1936
1937       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938       prologue_cost_vec.create (2);
1939       epilogue_cost_vec.create (2);
1940
1941       int dummy2;
1942       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943         (loop_vinfo, estimated_npeels, &dummy2,
1944          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945          &prologue_cost_vec, &epilogue_cost_vec);
1946
1947       prologue_cost_vec.release ();
1948       epilogue_cost_vec.release ();
1949
1950       peel_for_unknown_alignment.peel_info.count = 1
1951         + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952     }
1953
1954   peel_for_unknown_alignment.peel_info.npeel = 0;
1955   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1956
1957   best_peel = peel_for_unknown_alignment;
1958
1959   peel_for_known_alignment.inside_cost = INT_MAX;
1960   peel_for_known_alignment.outside_cost = INT_MAX;
1961   peel_for_known_alignment.peel_info.count = 0;
1962   peel_for_known_alignment.peel_info.dr_info = NULL;
1963
1964   if (do_peeling && one_misalignment_known)
1965     {
1966       /* Peeling is possible, but there is no data access that is not supported
1967          unless aligned.  So we try to choose the best possible peeling from
1968          the hash table.  */
1969       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970         (&peeling_htab, loop_vinfo);
1971     }
1972
1973   /* Compare costs of peeling for known and unknown alignment. */
1974   if (peel_for_known_alignment.peel_info.dr_info != NULL
1975       && peel_for_unknown_alignment.inside_cost
1976       >= peel_for_known_alignment.inside_cost)
1977     {
1978       best_peel = peel_for_known_alignment;
1979
1980       /* If the best peeling for known alignment has NPEEL == 0, perform no
1981          peeling at all except if there is an unsupportable dr that we can
1982          align.  */
1983       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984         do_peeling = false;
1985     }
1986
1987   /* If there is an unsupportable data ref, prefer this over all choices so far
1988      since we'd have to discard a chosen peeling except when it accidentally
1989      aligned the unsupportable data ref.  */
1990   if (one_dr_unsupportable)
1991     dr0_info = unsupportable_dr_info;
1992   else if (do_peeling)
1993     {
1994       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995          TODO: Use nopeel_outside_cost or get rid of it?  */
1996       unsigned nopeel_inside_cost = 0;
1997       unsigned nopeel_outside_cost = 0;
1998
1999       stmt_vector_for_cost dummy;
2000       dummy.create (2);
2001       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002                                       &nopeel_outside_cost, &dummy, &dummy,
2003                                       0, false);
2004       dummy.release ();
2005
2006       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2007          costs will be recorded.  */
2008       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009       prologue_cost_vec.create (2);
2010       epilogue_cost_vec.create (2);
2011
2012       int dummy2;
2013       nopeel_outside_cost += vect_get_known_peeling_cost
2014         (loop_vinfo, 0, &dummy2,
2015          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016          &prologue_cost_vec, &epilogue_cost_vec);
2017
2018       prologue_cost_vec.release ();
2019       epilogue_cost_vec.release ();
2020
2021       npeel = best_peel.peel_info.npeel;
2022       dr0_info = best_peel.peel_info.dr_info;
2023
2024       /* If no peeling is not more expensive than the best peeling we
2025          have so far, don't perform any peeling.  */
2026       if (nopeel_inside_cost <= best_peel.inside_cost)
2027         do_peeling = false;
2028     }
2029
2030   if (do_peeling)
2031     {
2032       stmt_vec_info stmt_info = dr0_info->stmt;
2033       vectype = STMT_VINFO_VECTYPE (stmt_info);
2034
2035       if (known_alignment_for_access_p (dr0_info))
2036         {
2037           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038                                                 size_zero_node) < 0;
2039           if (!npeel)
2040             {
2041               /* Since it's known at compile time, compute the number of
2042                  iterations in the peeled loop (the peeling factor) for use in
2043                  updating DR_MISALIGNMENT values.  The peeling factor is the
2044                  vectorization factor minus the misalignment as an element
2045                  count.  */
2046               mis = (negative
2047                      ? DR_MISALIGNMENT (dr0_info)
2048                      : -DR_MISALIGNMENT (dr0_info));
2049               /* If known_alignment_for_access_p then we have set
2050                  DR_MISALIGNMENT which is only done if we know it at compiler
2051                  time, so it is safe to assume target alignment is constant.
2052                */
2053               unsigned int target_align =
2054                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055               npeel = ((mis & (target_align - 1))
2056                        / vect_get_scalar_dr_size (dr0_info));
2057             }
2058
2059           /* For interleaved data access every iteration accesses all the
2060              members of the group, therefore we divide the number of iterations
2061              by the group size.  */
2062           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063             npeel /= DR_GROUP_SIZE (stmt_info);
2064
2065           if (dump_enabled_p ())
2066             dump_printf_loc (MSG_NOTE, vect_location,
2067                              "Try peeling by %d\n", npeel);
2068         }
2069
2070       /* Ensure that all datarefs can be vectorized after the peel.  */
2071       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072         do_peeling = false;
2073
2074       /* Check if all datarefs are supportable and log.  */
2075       if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2076         {
2077           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078           if (!stat)
2079             do_peeling = false;
2080           else
2081             return stat;
2082         }
2083
2084       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2085       if (do_peeling)
2086         {
2087           unsigned max_allowed_peel
2088             = param_vect_max_peeling_for_alignment;
2089           if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090             max_allowed_peel = 0;
2091           if (max_allowed_peel != (unsigned)-1)
2092             {
2093               unsigned max_peel = npeel;
2094               if (max_peel == 0)
2095                 {
2096                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097                   unsigned HOST_WIDE_INT target_align_c;
2098                   if (target_align.is_constant (&target_align_c))
2099                     max_peel =
2100                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101                   else
2102                     {
2103                       do_peeling = false;
2104                       if (dump_enabled_p ())
2105                         dump_printf_loc (MSG_NOTE, vect_location,
2106                           "Disable peeling, max peels set and vector"
2107                           " alignment unknown\n");
2108                     }
2109                 }
2110               if (max_peel > max_allowed_peel)
2111                 {
2112                   do_peeling = false;
2113                   if (dump_enabled_p ())
2114                     dump_printf_loc (MSG_NOTE, vect_location,
2115                         "Disable peeling, max peels reached: %d\n", max_peel);
2116                 }
2117             }
2118         }
2119
2120       /* Cost model #2 - if peeling may result in a remaining loop not
2121          iterating enough to be vectorized then do not peel.  Since this
2122          is a cost heuristic rather than a correctness decision, use the
2123          most likely runtime value for variable vectorization factors.  */
2124       if (do_peeling
2125           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2126         {
2127           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130               < assumed_vf + max_peel)
2131             do_peeling = false;
2132         }
2133
2134       if (do_peeling)
2135         {
2136           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137              If the misalignment of DR_i is identical to that of dr0 then set
2138              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2139              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140              by the peeling factor times the element size of DR_i (MOD the
2141              vectorization factor times the size).  Otherwise, the
2142              misalignment of DR_i must be set to unknown.  */
2143           FOR_EACH_VEC_ELT (datarefs, i, dr)
2144             if (dr != dr0_info->dr)
2145               {
2146                 /* Strided accesses perform only component accesses, alignment
2147                    is irrelevant for them.  */
2148                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149                 stmt_info = dr_info->stmt;
2150                 if (STMT_VINFO_STRIDED_P (stmt_info)
2151                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152                   continue;
2153
2154                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2155               }
2156
2157           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158           if (npeel)
2159             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160           else
2161             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162               = DR_MISALIGNMENT (dr0_info);
2163           SET_DR_MISALIGNMENT (dr0_info, 0);
2164           if (dump_enabled_p ())
2165             {
2166               dump_printf_loc (MSG_NOTE, vect_location,
2167                                "Alignment of access forced using peeling.\n");
2168               dump_printf_loc (MSG_NOTE, vect_location,
2169                                "Peeling for alignment will be applied.\n");
2170             }
2171
2172           /* The inside-loop cost will be accounted for in vectorizable_load
2173              and vectorizable_store correctly with adjusted alignments.
2174              Drop the body_cst_vec on the floor here.  */
2175           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176           gcc_assert (stat);
2177           return stat;
2178         }
2179     }
2180
2181   /* (2) Versioning to force alignment.  */
2182
2183   /* Try versioning if:
2184      1) optimize loop for speed and the cost-model is not cheap
2185      2) there is at least one unsupported misaligned data ref with an unknown
2186         misalignment, and
2187      3) all misaligned data refs with a known misalignment are supported, and
2188      4) the number of runtime alignment checks is within reason.  */
2189
2190   do_versioning
2191     = (optimize_loop_nest_for_speed_p (loop)
2192        && !loop->inner /* FORNOW */
2193        && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2194
2195   if (do_versioning)
2196     {
2197       FOR_EACH_VEC_ELT (datarefs, i, dr)
2198         {
2199           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200           stmt_vec_info stmt_info = dr_info->stmt;
2201
2202           /* For interleaving, only the alignment of the first access
2203              matters.  */
2204           if (aligned_access_p (dr_info)
2205               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207             continue;
2208
2209           if (STMT_VINFO_STRIDED_P (stmt_info))
2210             {
2211               /* Strided loads perform only component accesses, alignment is
2212                  irrelevant for them.  */
2213               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214                 continue;
2215               do_versioning = false;
2216               break;
2217             }
2218
2219           supportable_dr_alignment
2220             = vect_supportable_dr_alignment (dr_info, false);
2221
2222           if (!supportable_dr_alignment)
2223             {
2224               int mask;
2225               tree vectype;
2226
2227               if (known_alignment_for_access_p (dr_info)
2228                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229                   >= (unsigned) param_vect_max_version_for_alignment_checks)
2230                 {
2231                   do_versioning = false;
2232                   break;
2233                 }
2234
2235               vectype = STMT_VINFO_VECTYPE (stmt_info);
2236               gcc_assert (vectype);
2237
2238               /* At present we don't support versioning for alignment
2239                  with variable VF, since there's no guarantee that the
2240                  VF is a power of two.  We could relax this if we added
2241                  a way of enforcing a power-of-two size.  */
2242               unsigned HOST_WIDE_INT size;
2243               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2244                 {
2245                   do_versioning = false;
2246                   break;
2247                 }
2248
2249               /* Forcing alignment in the first iteration is no good if
2250                  we don't keep it across iterations.  For now, just disable
2251                  versioning in this case.
2252                  ?? We could actually unroll the loop to achieve the required
2253                  overall step alignment, and forcing the alignment could be
2254                  done by doing some iterations of the non-vectorized loop.  */
2255               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256                                * DR_STEP_ALIGNMENT (dr),
2257                                DR_TARGET_ALIGNMENT (dr_info)))
2258                 {
2259                   do_versioning = false;
2260                   break;
2261                 }
2262
2263               /* The rightmost bits of an aligned address must be zeros.
2264                  Construct the mask needed for this test.  For example,
2265                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266                  mask must be 15 = 0xf. */
2267               mask = size - 1;
2268
2269               /* FORNOW: use the same mask to test all potentially unaligned
2270                  references in the loop.  */
2271               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2272                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2273                 {
2274                   do_versioning = false;
2275                   break;
2276                 }
2277
2278               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2279               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2280             }
2281         }
2282
2283       /* Versioning requires at least one misaligned data reference.  */
2284       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2285         do_versioning = false;
2286       else if (!do_versioning)
2287         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2288     }
2289
2290   if (do_versioning)
2291     {
2292       vec<stmt_vec_info> may_misalign_stmts
2293         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2294       stmt_vec_info stmt_info;
2295
2296       /* It can now be assumed that the data references in the statements
2297          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2298          of the loop being vectorized.  */
2299       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2300         {
2301           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2302           SET_DR_MISALIGNMENT (dr_info, 0);
2303           if (dump_enabled_p ())
2304             dump_printf_loc (MSG_NOTE, vect_location,
2305                              "Alignment of access forced using versioning.\n");
2306         }
2307
2308       if (dump_enabled_p ())
2309         dump_printf_loc (MSG_NOTE, vect_location,
2310                          "Versioning for alignment will be applied.\n");
2311
2312       /* Peeling and versioning can't be done together at this time.  */
2313       gcc_assert (! (do_peeling && do_versioning));
2314
2315       opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2316       gcc_assert (stat);
2317       return stat;
2318     }
2319
2320   /* This point is reached if neither peeling nor versioning is being done.  */
2321   gcc_assert (! (do_peeling || do_versioning));
2322
2323   opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2324   return stat;
2325 }
2326
2327
2328 /* Function vect_find_same_alignment_drs.
2329
2330    Update group and alignment relations in VINFO according to the chosen
2331    vectorization factor.  */
2332
2333 static void
2334 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2335 {
2336   struct data_reference *dra = DDR_A (ddr);
2337   struct data_reference *drb = DDR_B (ddr);
2338   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2339   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2340   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2341   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2342
2343   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2344     return;
2345
2346   if (dra == drb)
2347     return;
2348
2349   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2350       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2351     return;
2352
2353   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2354       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2355       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2356     return;
2357
2358   /* Two references with distance zero have the same alignment.  */
2359   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2360                           - wi::to_poly_offset (DR_INIT (drb)));
2361   if (maybe_ne (diff, 0))
2362     {
2363       /* Get the wider of the two alignments.  */
2364       poly_uint64 align_a =
2365         exact_div (vect_calculate_target_alignment (dr_info_a),
2366                    BITS_PER_UNIT);
2367       poly_uint64 align_b =
2368         exact_div (vect_calculate_target_alignment (dr_info_b),
2369                    BITS_PER_UNIT);
2370       unsigned HOST_WIDE_INT align_a_c, align_b_c;
2371       if (!align_a.is_constant (&align_a_c)
2372           || !align_b.is_constant (&align_b_c))
2373         return;
2374
2375       unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2376
2377       /* Require the gap to be a multiple of the larger vector alignment.  */
2378       if (!multiple_p (diff, max_align))
2379         return;
2380     }
2381
2382   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2383   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2384   if (dump_enabled_p ())
2385     dump_printf_loc (MSG_NOTE, vect_location,
2386                      "accesses have the same alignment: %T and %T\n",
2387                      DR_REF (dra), DR_REF (drb));
2388 }
2389
2390
2391 /* Function vect_analyze_data_refs_alignment
2392
2393    Analyze the alignment of the data-references in the loop.
2394    Return FALSE if a data reference is found that cannot be vectorized.  */
2395
2396 opt_result
2397 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2398 {
2399   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2400
2401   /* Mark groups of data references with same alignment using
2402      data dependence information.  */
2403   vec<ddr_p> ddrs = vinfo->shared->ddrs;
2404   struct data_dependence_relation *ddr;
2405   unsigned int i;
2406
2407   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2408     vect_find_same_alignment_drs (vinfo, ddr);
2409
2410   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2411   struct data_reference *dr;
2412
2413   vect_record_base_alignments (vinfo);
2414   FOR_EACH_VEC_ELT (datarefs, i, dr)
2415     {
2416       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2417       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2418         vect_compute_data_ref_alignment (dr_info);
2419     }
2420
2421   return opt_result::success ();
2422 }
2423
2424
2425 /* Analyze alignment of DRs of stmts in NODE.  */
2426
2427 static bool
2428 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2429 {
2430   /* We vectorize from the first scalar stmt in the node unless
2431      the node is permuted in which case we start from the first
2432      element in the group.  */
2433   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2434   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2435   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2436     first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2437
2438   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439   vect_compute_data_ref_alignment (dr_info);
2440   /* For creating the data-ref pointer we need alignment of the
2441      first element anyway.  */
2442   if (dr_info != first_dr_info)
2443     vect_compute_data_ref_alignment (first_dr_info);
2444   if (! verify_data_ref_alignment (dr_info))
2445     {
2446       if (dump_enabled_p ())
2447         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448                          "not vectorized: bad data alignment in basic "
2449                          "block.\n");
2450       return false;
2451     }
2452
2453   return true;
2454 }
2455
2456 /* Function vect_slp_analyze_instance_alignment
2457
2458    Analyze the alignment of the data-references in the SLP instance.
2459    Return FALSE if a data reference is found that cannot be vectorized.  */
2460
2461 bool
2462 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2463 {
2464   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2465
2466   slp_tree node;
2467   unsigned i;
2468   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2469     if (! vect_slp_analyze_and_verify_node_alignment (node))
2470       return false;
2471
2472   node = SLP_INSTANCE_TREE (instance);
2473   if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2474       && ! vect_slp_analyze_and_verify_node_alignment
2475              (SLP_INSTANCE_TREE (instance)))
2476     return false;
2477
2478   return true;
2479 }
2480
2481
2482 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2483    accesses of legal size, step, etc.  Detect gaps, single element
2484    interleaving, and other special cases. Set grouped access info.
2485    Collect groups of strided stores for further use in SLP analysis.
2486    Worker for vect_analyze_group_access.  */
2487
2488 static bool
2489 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2490 {
2491   data_reference *dr = dr_info->dr;
2492   tree step = DR_STEP (dr);
2493   tree scalar_type = TREE_TYPE (DR_REF (dr));
2494   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2495   stmt_vec_info stmt_info = dr_info->stmt;
2496   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2497   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2498   HOST_WIDE_INT dr_step = -1;
2499   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2500   bool slp_impossible = false;
2501
2502   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2503      size of the interleaving group (including gaps).  */
2504   if (tree_fits_shwi_p (step))
2505     {
2506       dr_step = tree_to_shwi (step);
2507       /* Check that STEP is a multiple of type size.  Otherwise there is
2508          a non-element-sized gap at the end of the group which we
2509          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2510          ???  As we can handle non-constant step fine here we should
2511          simply remove uses of DR_GROUP_GAP between the last and first
2512          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2513          simply not include that gap.  */
2514       if ((dr_step % type_size) != 0)
2515         {
2516           if (dump_enabled_p ())
2517             dump_printf_loc (MSG_NOTE, vect_location,
2518                              "Step %T is not a multiple of the element size"
2519                              " for %T\n",
2520                              step, DR_REF (dr));
2521           return false;
2522         }
2523       groupsize = absu_hwi (dr_step) / type_size;
2524     }
2525   else
2526     groupsize = 0;
2527
2528   /* Not consecutive access is possible only if it is a part of interleaving.  */
2529   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2530     {
2531       /* Check if it this DR is a part of interleaving, and is a single
2532          element of the group that is accessed in the loop.  */
2533
2534       /* Gaps are supported only for loads. STEP must be a multiple of the type
2535          size.  */
2536       if (DR_IS_READ (dr)
2537           && (dr_step % type_size) == 0
2538           && groupsize > 0)
2539         {
2540           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2541           DR_GROUP_SIZE (stmt_info) = groupsize;
2542           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2543           if (dump_enabled_p ())
2544             dump_printf_loc (MSG_NOTE, vect_location,
2545                              "Detected single element interleaving %T"
2546                              " step %T\n",
2547                              DR_REF (dr), step);
2548
2549           return true;
2550         }
2551
2552       if (dump_enabled_p ())
2553         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554                          "not consecutive access %G", stmt_info->stmt);
2555
2556       if (bb_vinfo)
2557         {
2558           /* Mark the statement as unvectorizable.  */
2559           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2560           return true;
2561         }
2562
2563       if (dump_enabled_p ())
2564         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2565       STMT_VINFO_STRIDED_P (stmt_info) = true;
2566       return true;
2567     }
2568
2569   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2570     {
2571       /* First stmt in the interleaving chain. Check the chain.  */
2572       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2573       struct data_reference *data_ref = dr;
2574       unsigned int count = 1;
2575       tree prev_init = DR_INIT (data_ref);
2576       HOST_WIDE_INT diff, gaps = 0;
2577
2578       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2579       while (next)
2580         {
2581           /* We never have the same DR multiple times.  */
2582           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2583                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2584
2585           data_ref = STMT_VINFO_DATA_REF (next);
2586
2587           /* All group members have the same STEP by construction.  */
2588           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2589
2590           /* Check that the distance between two accesses is equal to the type
2591              size. Otherwise, we have gaps.  */
2592           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2593                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2594           if (diff != 1)
2595             {
2596               /* FORNOW: SLP of accesses with gaps is not supported.  */
2597               slp_impossible = true;
2598               if (DR_IS_WRITE (data_ref))
2599                 {
2600                   if (dump_enabled_p ())
2601                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2602                                      "interleaved store with gaps\n");
2603                   return false;
2604                 }
2605
2606               gaps += diff - 1;
2607             }
2608
2609           last_accessed_element += diff;
2610
2611           /* Store the gap from the previous member of the group. If there is no
2612              gap in the access, DR_GROUP_GAP is always 1.  */
2613           DR_GROUP_GAP (next) = diff;
2614
2615           prev_init = DR_INIT (data_ref);
2616           next = DR_GROUP_NEXT_ELEMENT (next);
2617           /* Count the number of data-refs in the chain.  */
2618           count++;
2619         }
2620
2621       if (groupsize == 0)
2622         groupsize = count + gaps;
2623
2624       /* This could be UINT_MAX but as we are generating code in a very
2625          inefficient way we have to cap earlier.  See PR78699 for example.  */
2626       if (groupsize > 4096)
2627         {
2628           if (dump_enabled_p ())
2629             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630                              "group is too large\n");
2631           return false;
2632         }
2633
2634       /* Check that the size of the interleaving is equal to count for stores,
2635          i.e., that there are no gaps.  */
2636       if (groupsize != count
2637           && !DR_IS_READ (dr))
2638         {
2639           groupsize = count;
2640           STMT_VINFO_STRIDED_P (stmt_info) = true;
2641         }
2642
2643       /* If there is a gap after the last load in the group it is the
2644          difference between the groupsize and the last accessed
2645          element.
2646          When there is no gap, this difference should be 0.  */
2647       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2648
2649       DR_GROUP_SIZE (stmt_info) = groupsize;
2650       if (dump_enabled_p ())
2651         {
2652           dump_printf_loc (MSG_NOTE, vect_location,
2653                            "Detected interleaving ");
2654           if (DR_IS_READ (dr))
2655             dump_printf (MSG_NOTE, "load ");
2656           else if (STMT_VINFO_STRIDED_P (stmt_info))
2657             dump_printf (MSG_NOTE, "strided store ");
2658           else
2659             dump_printf (MSG_NOTE, "store ");
2660           dump_printf (MSG_NOTE, "of size %u\n",
2661                        (unsigned)groupsize);
2662           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2663           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2664           while (next)
2665             {
2666               if (DR_GROUP_GAP (next) != 1)
2667                 dump_printf_loc (MSG_NOTE, vect_location,
2668                                  "\t<gap of %d elements>\n",
2669                                  DR_GROUP_GAP (next) - 1);
2670               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2671               next = DR_GROUP_NEXT_ELEMENT (next);
2672             }
2673           if (DR_GROUP_GAP (stmt_info) != 0)
2674             dump_printf_loc (MSG_NOTE, vect_location,
2675                              "\t<gap of %d elements>\n",
2676                              DR_GROUP_GAP (stmt_info));
2677         }
2678
2679       /* SLP: create an SLP data structure for every interleaving group of
2680          stores for further analysis in vect_analyse_slp.  */
2681       if (DR_IS_WRITE (dr) && !slp_impossible)
2682         {
2683           if (loop_vinfo)
2684             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2685           if (bb_vinfo)
2686             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2687         }
2688     }
2689
2690   return true;
2691 }
2692
2693 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2694    accesses of legal size, step, etc.  Detect gaps, single element
2695    interleaving, and other special cases. Set grouped access info.
2696    Collect groups of strided stores for further use in SLP analysis.  */
2697
2698 static bool
2699 vect_analyze_group_access (dr_vec_info *dr_info)
2700 {
2701   if (!vect_analyze_group_access_1 (dr_info))
2702     {
2703       /* Dissolve the group if present.  */
2704       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2705       while (stmt_info)
2706         {
2707           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2708           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2709           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2710           stmt_info = next;
2711         }
2712       return false;
2713     }
2714   return true;
2715 }
2716
2717 /* Analyze the access pattern of the data-reference DR_INFO.
2718    In case of non-consecutive accesses call vect_analyze_group_access() to
2719    analyze groups of accesses.  */
2720
2721 static bool
2722 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2723 {
2724   data_reference *dr = dr_info->dr;
2725   tree step = DR_STEP (dr);
2726   tree scalar_type = TREE_TYPE (DR_REF (dr));
2727   stmt_vec_info stmt_info = dr_info->stmt;
2728   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2729   class loop *loop = NULL;
2730
2731   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2732     return true;
2733
2734   if (loop_vinfo)
2735     loop = LOOP_VINFO_LOOP (loop_vinfo);
2736
2737   if (loop_vinfo && !step)
2738     {
2739       if (dump_enabled_p ())
2740         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2741                          "bad data-ref access in loop\n");
2742       return false;
2743     }
2744
2745   /* Allow loads with zero step in inner-loop vectorization.  */
2746   if (loop_vinfo && integer_zerop (step))
2747     {
2748       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2749       if (!nested_in_vect_loop_p (loop, stmt_info))
2750         return DR_IS_READ (dr);
2751       /* Allow references with zero step for outer loops marked
2752          with pragma omp simd only - it guarantees absence of
2753          loop-carried dependencies between inner loop iterations.  */
2754       if (loop->safelen < 2)
2755         {
2756           if (dump_enabled_p ())
2757             dump_printf_loc (MSG_NOTE, vect_location,
2758                              "zero step in inner loop of nest\n");
2759           return false;
2760         }
2761     }
2762
2763   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2764     {
2765       /* Interleaved accesses are not yet supported within outer-loop
2766         vectorization for references in the inner-loop.  */
2767       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2768
2769       /* For the rest of the analysis we use the outer-loop step.  */
2770       step = STMT_VINFO_DR_STEP (stmt_info);
2771       if (integer_zerop (step))
2772         {
2773           if (dump_enabled_p ())
2774             dump_printf_loc (MSG_NOTE, vect_location,
2775                              "zero step in outer loop.\n");
2776           return DR_IS_READ (dr);
2777         }
2778     }
2779
2780   /* Consecutive?  */
2781   if (TREE_CODE (step) == INTEGER_CST)
2782     {
2783       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2784       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2785           || (dr_step < 0
2786               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2787         {
2788           /* Mark that it is not interleaving.  */
2789           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2790           return true;
2791         }
2792     }
2793
2794   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2795     {
2796       if (dump_enabled_p ())
2797         dump_printf_loc (MSG_NOTE, vect_location,
2798                          "grouped access in outer loop.\n");
2799       return false;
2800     }
2801
2802
2803   /* Assume this is a DR handled by non-constant strided load case.  */
2804   if (TREE_CODE (step) != INTEGER_CST)
2805     return (STMT_VINFO_STRIDED_P (stmt_info)
2806             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2807                 || vect_analyze_group_access (dr_info)));
2808
2809   /* Not consecutive access - check if it's a part of interleaving group.  */
2810   return vect_analyze_group_access (dr_info);
2811 }
2812
2813 /* Compare two data-references DRA and DRB to group them into chunks
2814    suitable for grouping.  */
2815
2816 static int
2817 dr_group_sort_cmp (const void *dra_, const void *drb_)
2818 {
2819   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2820   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2821   int cmp;
2822
2823   /* Stabilize sort.  */
2824   if (dra == drb)
2825     return 0;
2826
2827   /* DRs in different loops never belong to the same group.  */
2828   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2829   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2830   if (loopa != loopb)
2831     return loopa->num < loopb->num ? -1 : 1;
2832
2833   /* Ordering of DRs according to base.  */
2834   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2835                                DR_BASE_ADDRESS (drb));
2836   if (cmp != 0)
2837     return cmp;
2838
2839   /* And according to DR_OFFSET.  */
2840   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2841   if (cmp != 0)
2842     return cmp;
2843
2844   /* Put reads before writes.  */
2845   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2846     return DR_IS_READ (dra) ? -1 : 1;
2847
2848   /* Then sort after access size.  */
2849   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2850                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2851   if (cmp != 0)
2852     return cmp;
2853
2854   /* And after step.  */
2855   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2856   if (cmp != 0)
2857     return cmp;
2858
2859   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2860   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2861   if (cmp == 0)
2862     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2863   return cmp;
2864 }
2865
2866 /* If OP is the result of a conversion, return the unconverted value,
2867    otherwise return null.  */
2868
2869 static tree
2870 strip_conversion (tree op)
2871 {
2872   if (TREE_CODE (op) != SSA_NAME)
2873     return NULL_TREE;
2874   gimple *stmt = SSA_NAME_DEF_STMT (op);
2875   if (!is_gimple_assign (stmt)
2876       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2877     return NULL_TREE;
2878   return gimple_assign_rhs1 (stmt);
2879 }
2880
2881 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2882    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
2883    be grouped in SLP mode.  */
2884
2885 static bool
2886 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2887                    bool allow_slp_p)
2888 {
2889   if (gimple_assign_single_p (stmt1_info->stmt))
2890     return gimple_assign_single_p (stmt2_info->stmt);
2891
2892   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2893   if (call1 && gimple_call_internal_p (call1))
2894     {
2895       /* Check for two masked loads or two masked stores.  */
2896       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2897       if (!call2 || !gimple_call_internal_p (call2))
2898         return false;
2899       internal_fn ifn = gimple_call_internal_fn (call1);
2900       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2901         return false;
2902       if (ifn != gimple_call_internal_fn (call2))
2903         return false;
2904
2905       /* Check that the masks are the same.  Cope with casts of masks,
2906          like those created by build_mask_conversion.  */
2907       tree mask1 = gimple_call_arg (call1, 2);
2908       tree mask2 = gimple_call_arg (call2, 2);
2909       if (!operand_equal_p (mask1, mask2, 0)
2910           && (ifn == IFN_MASK_STORE || !allow_slp_p))
2911         {
2912           mask1 = strip_conversion (mask1);
2913           if (!mask1)
2914             return false;
2915           mask2 = strip_conversion (mask2);
2916           if (!mask2)
2917             return false;
2918           if (!operand_equal_p (mask1, mask2, 0))
2919             return false;
2920         }
2921       return true;
2922     }
2923
2924   return false;
2925 }
2926
2927 /* Function vect_analyze_data_ref_accesses.
2928
2929    Analyze the access pattern of all the data references in the loop.
2930
2931    FORNOW: the only access pattern that is considered vectorizable is a
2932            simple step 1 (consecutive) access.
2933
2934    FORNOW: handle only arrays and pointer accesses.  */
2935
2936 opt_result
2937 vect_analyze_data_ref_accesses (vec_info *vinfo)
2938 {
2939   unsigned int i;
2940   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2941   struct data_reference *dr;
2942
2943   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2944
2945   if (datarefs.is_empty ())
2946     return opt_result::success ();
2947
2948   /* Sort the array of datarefs to make building the interleaving chains
2949      linear.  Don't modify the original vector's order, it is needed for
2950      determining what dependencies are reversed.  */
2951   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2952   datarefs_copy.qsort (dr_group_sort_cmp);
2953   hash_set<stmt_vec_info> to_fixup;
2954
2955   /* Build the interleaving chains.  */
2956   for (i = 0; i < datarefs_copy.length () - 1;)
2957     {
2958       data_reference_p dra = datarefs_copy[i];
2959       dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2960       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2961       stmt_vec_info lastinfo = NULL;
2962       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2963           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2964         {
2965           ++i;
2966           continue;
2967         }
2968       for (i = i + 1; i < datarefs_copy.length (); ++i)
2969         {
2970           data_reference_p drb = datarefs_copy[i];
2971           dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2972           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2973           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2974               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2975             break;
2976
2977           /* ???  Imperfect sorting (non-compatible types, non-modulo
2978              accesses, same accesses) can lead to a group to be artificially
2979              split here as we don't just skip over those.  If it really
2980              matters we can push those to a worklist and re-iterate
2981              over them.  The we can just skip ahead to the next DR here.  */
2982
2983           /* DRs in a different loop should not be put into the same
2984              interleaving group.  */
2985           if (gimple_bb (DR_STMT (dra))->loop_father
2986               != gimple_bb (DR_STMT (drb))->loop_father)
2987             break;
2988
2989           /* Check that the data-refs have same first location (except init)
2990              and they are both either store or load (not load and store,
2991              not masked loads or stores).  */
2992           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2993               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2994                                         DR_BASE_ADDRESS (drb)) != 0
2995               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2996               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2997             break;
2998
2999           /* Check that the data-refs have the same constant size.  */
3000           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3001           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3002           if (!tree_fits_uhwi_p (sza)
3003               || !tree_fits_uhwi_p (szb)
3004               || !tree_int_cst_equal (sza, szb))
3005             break;
3006
3007           /* Check that the data-refs have the same step.  */
3008           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3009             break;
3010
3011           /* Check the types are compatible.
3012              ???  We don't distinguish this during sorting.  */
3013           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3014                                    TREE_TYPE (DR_REF (drb))))
3015             break;
3016
3017           /* Check that the DR_INITs are compile-time constants.  */
3018           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3019               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3020             break;
3021
3022           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3023              just hold extra information.  */
3024           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3025               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3026               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3027             break;
3028
3029           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3030           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3031           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3032           HOST_WIDE_INT init_prev
3033             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3034           gcc_assert (init_a <= init_b
3035                       && init_a <= init_prev
3036                       && init_prev <= init_b);
3037
3038           /* Do not place the same access in the interleaving chain twice.  */
3039           if (init_b == init_prev)
3040             {
3041               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3042                           < gimple_uid (DR_STMT (drb)));
3043               /* Simply link in duplicates and fix up the chain below.  */
3044             }
3045           else
3046             {
3047               /* If init_b == init_a + the size of the type * k, we have an
3048                  interleaving, and DRA is accessed before DRB.  */
3049               HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3050               if (type_size_a == 0
3051                   || (init_b - init_a) % type_size_a != 0)
3052                 break;
3053
3054               /* If we have a store, the accesses are adjacent.  This splits
3055                  groups into chunks we support (we don't support vectorization
3056                  of stores with gaps).  */
3057               if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3058                 break;
3059
3060               /* If the step (if not zero or non-constant) is greater than the
3061                  difference between data-refs' inits this splits groups into
3062                  suitable sizes.  */
3063               if (tree_fits_shwi_p (DR_STEP (dra)))
3064                 {
3065                   HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3066                   if (step != 0 && step <= (init_b - init_a))
3067                     break;
3068                 }
3069             }
3070
3071           if (dump_enabled_p ())
3072             dump_printf_loc (MSG_NOTE, vect_location,
3073                              DR_IS_READ (dra)
3074                              ? "Detected interleaving load %T and %T\n"
3075                              : "Detected interleaving store %T and %T\n",
3076                              DR_REF (dra), DR_REF (drb));
3077
3078           /* Link the found element into the group list.  */
3079           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3080             {
3081               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3082               lastinfo = stmtinfo_a;
3083             }
3084           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3085           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3086           lastinfo = stmtinfo_b;
3087
3088           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3089             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3090
3091           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3092             dump_printf_loc (MSG_NOTE, vect_location,
3093                              "Load suitable for SLP vectorization only.\n");
3094
3095           if (init_b == init_prev
3096               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3097               && dump_enabled_p ())
3098             dump_printf_loc (MSG_NOTE, vect_location,
3099                              "Queuing group with duplicate access for fixup\n");
3100         }
3101     }
3102
3103   /* Fixup groups with duplicate entries by splitting it.  */
3104   while (1)
3105     {
3106       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3107       if (!(it != to_fixup.end ()))
3108         break;
3109       stmt_vec_info grp = *it;
3110       to_fixup.remove (grp);
3111
3112       /* Find the earliest duplicate group member.  */
3113       unsigned first_duplicate = -1u;
3114       stmt_vec_info next, g = grp;
3115       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3116         {
3117           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3118                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3119               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3120             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3121           g = next;
3122         }
3123       if (first_duplicate == -1U)
3124         continue;
3125
3126       /* Then move all stmts after the first duplicate to a new group.
3127          Note this is a heuristic but one with the property that *it
3128          is fixed up completely.  */
3129       g = grp;
3130       stmt_vec_info newgroup = NULL, ng = grp;
3131       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3132         {
3133           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3134             {
3135               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3136               if (!newgroup)
3137                 newgroup = next;
3138               else
3139                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3140               ng = next;
3141               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3142             }
3143           else
3144             g = DR_GROUP_NEXT_ELEMENT (g);
3145         }
3146       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3147
3148       /* Fixup the new group which still may contain duplicates.  */
3149       to_fixup.add (newgroup);
3150     }
3151
3152   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3153     {
3154       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3155       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3156           && !vect_analyze_data_ref_access (dr_info))
3157         {
3158           if (dump_enabled_p ())
3159             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3160                              "not vectorized: complicated access pattern.\n");
3161
3162           if (is_a <bb_vec_info> (vinfo))
3163             {
3164               /* Mark the statement as not vectorizable.  */
3165               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3166               continue;
3167             }
3168           else
3169             {
3170               datarefs_copy.release ();
3171               return opt_result::failure_at (dr_info->stmt->stmt,
3172                                              "not vectorized:"
3173                                              " complicated access pattern.\n");
3174             }
3175         }
3176     }
3177
3178   datarefs_copy.release ();
3179   return opt_result::success ();
3180 }
3181
3182 /* Function vect_vfa_segment_size.
3183
3184    Input:
3185      DR_INFO: The data reference.
3186      LENGTH_FACTOR: segment length to consider.
3187
3188    Return a value suitable for the dr_with_seg_len::seg_len field.
3189    This is the "distance travelled" by the pointer from the first
3190    iteration in the segment to the last.  Note that it does not include
3191    the size of the access; in effect it only describes the first byte.  */
3192
3193 static tree
3194 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3195 {
3196   length_factor = size_binop (MINUS_EXPR,
3197                               fold_convert (sizetype, length_factor),
3198                               size_one_node);
3199   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3200                      length_factor);
3201 }
3202
3203 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3204    gives the worst-case number of bytes covered by the segment.  */
3205
3206 static unsigned HOST_WIDE_INT
3207 vect_vfa_access_size (dr_vec_info *dr_info)
3208 {
3209   stmt_vec_info stmt_vinfo = dr_info->stmt;
3210   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3211   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3212   unsigned HOST_WIDE_INT access_size = ref_size;
3213   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3214     {
3215       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3216       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3217     }
3218   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3219       && (vect_supportable_dr_alignment (dr_info, false)
3220           == dr_explicit_realign_optimized))
3221     {
3222       /* We might access a full vector's worth.  */
3223       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3224       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3225     }
3226   return access_size;
3227 }
3228
3229 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3230    describes.  */
3231
3232 static unsigned int
3233 vect_vfa_align (dr_vec_info *dr_info)
3234 {
3235   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3236 }
3237
3238 /* Function vect_no_alias_p.
3239
3240    Given data references A and B with equal base and offset, see whether
3241    the alias relation can be decided at compilation time.  Return 1 if
3242    it can and the references alias, 0 if it can and the references do
3243    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3244    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3245    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3246
3247 static int
3248 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3249                          tree segment_length_a, tree segment_length_b,
3250                          unsigned HOST_WIDE_INT access_size_a,
3251                          unsigned HOST_WIDE_INT access_size_b)
3252 {
3253   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3254   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3255   poly_uint64 const_length_a;
3256   poly_uint64 const_length_b;
3257
3258   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3259      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3260      [a, a+12) */
3261   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3262     {
3263       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3264       offset_a = (offset_a + access_size_a) - const_length_a;
3265     }
3266   else
3267     const_length_a = tree_to_poly_uint64 (segment_length_a);
3268   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3269     {
3270       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3271       offset_b = (offset_b + access_size_b) - const_length_b;
3272     }
3273   else
3274     const_length_b = tree_to_poly_uint64 (segment_length_b);
3275
3276   const_length_a += access_size_a;
3277   const_length_b += access_size_b;
3278
3279   if (ranges_known_overlap_p (offset_a, const_length_a,
3280                               offset_b, const_length_b))
3281     return 1;
3282
3283   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3284                                offset_b, const_length_b))
3285     return 0;
3286
3287   return -1;
3288 }
3289
3290 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3291    in DDR is >= VF.  */
3292
3293 static bool
3294 dependence_distance_ge_vf (data_dependence_relation *ddr,
3295                            unsigned int loop_depth, poly_uint64 vf)
3296 {
3297   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3298       || DDR_NUM_DIST_VECTS (ddr) == 0)
3299     return false;
3300
3301   /* If the dependence is exact, we should have limited the VF instead.  */
3302   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3303
3304   unsigned int i;
3305   lambda_vector dist_v;
3306   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3307     {
3308       HOST_WIDE_INT dist = dist_v[loop_depth];
3309       if (dist != 0
3310           && !(dist > 0 && DDR_REVERSED_P (ddr))
3311           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3312         return false;
3313     }
3314
3315   if (dump_enabled_p ())
3316     dump_printf_loc (MSG_NOTE, vect_location,
3317                      "dependence distance between %T and %T is >= VF\n",
3318                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3319
3320   return true;
3321 }
3322
3323 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3324
3325 static void
3326 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3327 {
3328   dump_printf (dump_kind, "%s (%T) >= ",
3329                lower_bound.unsigned_p ? "unsigned" : "abs",
3330                lower_bound.expr);
3331   dump_dec (dump_kind, lower_bound.min_value);
3332 }
3333
3334 /* Record that the vectorized loop requires the vec_lower_bound described
3335    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3336
3337 static void
3338 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3339                         poly_uint64 min_value)
3340 {
3341   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3342   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3343     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3344       {
3345         unsigned_p &= lower_bounds[i].unsigned_p;
3346         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3347         if (lower_bounds[i].unsigned_p != unsigned_p
3348             || maybe_lt (lower_bounds[i].min_value, min_value))
3349           {
3350             lower_bounds[i].unsigned_p = unsigned_p;
3351             lower_bounds[i].min_value = min_value;
3352             if (dump_enabled_p ())
3353               {
3354                 dump_printf_loc (MSG_NOTE, vect_location,
3355                                  "updating run-time check to ");
3356                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3357                 dump_printf (MSG_NOTE, "\n");
3358               }
3359           }
3360         return;
3361       }
3362
3363   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3364   if (dump_enabled_p ())
3365     {
3366       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3367       dump_lower_bound (MSG_NOTE, lower_bound);
3368       dump_printf (MSG_NOTE, "\n");
3369     }
3370   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3371 }
3372
3373 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3374    will span fewer than GAP bytes.  */
3375
3376 static bool
3377 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3378                   poly_int64 gap)
3379 {
3380   stmt_vec_info stmt_info = dr_info->stmt;
3381   HOST_WIDE_INT count
3382     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3383   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3384     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3385   return (estimated_poly_value (gap)
3386           <= count * vect_get_scalar_dr_size (dr_info));
3387 }
3388
3389 /* Return true if we know that there is no alias between DR_INFO_A and
3390    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3391    When returning true, set *LOWER_BOUND_OUT to this N.  */
3392
3393 static bool
3394 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3395                                 poly_uint64 *lower_bound_out)
3396 {
3397   /* Check that there is a constant gap of known sign between DR_A
3398      and DR_B.  */
3399   data_reference *dr_a = dr_info_a->dr;
3400   data_reference *dr_b = dr_info_b->dr;
3401   poly_int64 init_a, init_b;
3402   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3403       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3404       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3405       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3406       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3407       || !ordered_p (init_a, init_b))
3408     return false;
3409
3410   /* Sort DR_A and DR_B by the address they access.  */
3411   if (maybe_lt (init_b, init_a))
3412     {
3413       std::swap (init_a, init_b);
3414       std::swap (dr_info_a, dr_info_b);
3415       std::swap (dr_a, dr_b);
3416     }
3417
3418   /* If the two accesses could be dependent within a scalar iteration,
3419      make sure that we'd retain their order.  */
3420   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3421       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3422     return false;
3423
3424   /* There is no alias if abs (DR_STEP) is greater than or equal to
3425      the bytes spanned by the combination of the two accesses.  */
3426   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3427   return true;
3428 }
3429
3430 /* Function vect_prune_runtime_alias_test_list.
3431
3432    Prune a list of ddrs to be tested at run-time by versioning for alias.
3433    Merge several alias checks into one if possible.
3434    Return FALSE if resulting list of ddrs is longer then allowed by
3435    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3436
3437 opt_result
3438 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3439 {
3440   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3441   hash_set <tree_pair_hash> compared_objects;
3442
3443   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3444   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3445     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3446   vec<vec_object_pair> &check_unequal_addrs
3447     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3448   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3449   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3450
3451   ddr_p ddr;
3452   unsigned int i;
3453   tree length_factor;
3454
3455   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3456
3457   /* Step values are irrelevant for aliasing if the number of vector
3458      iterations is equal to the number of scalar iterations (which can
3459      happen for fully-SLP loops).  */
3460   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3461
3462   if (!ignore_step_p)
3463     {
3464       /* Convert the checks for nonzero steps into bound tests.  */
3465       tree value;
3466       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3467         vect_check_lower_bound (loop_vinfo, value, true, 1);
3468     }
3469
3470   if (may_alias_ddrs.is_empty ())
3471     return opt_result::success ();
3472
3473   comp_alias_ddrs.create (may_alias_ddrs.length ());
3474
3475   unsigned int loop_depth
3476     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3477                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3478
3479   /* First, we collect all data ref pairs for aliasing checks.  */
3480   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3481     {
3482       poly_uint64 lower_bound;
3483       tree segment_length_a, segment_length_b;
3484       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3485       unsigned int align_a, align_b;
3486
3487       /* Ignore the alias if the VF we chose ended up being no greater
3488          than the dependence distance.  */
3489       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3490         continue;
3491
3492       if (DDR_OBJECT_A (ddr))
3493         {
3494           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3495           if (!compared_objects.add (new_pair))
3496             {
3497               if (dump_enabled_p ())
3498                 dump_printf_loc (MSG_NOTE, vect_location,
3499                                  "checking that %T and %T"
3500                                  " have different addresses\n",
3501                                  new_pair.first, new_pair.second);
3502               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3503             }
3504           continue;
3505         }
3506
3507       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3508       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3509
3510       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3511       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3512
3513       bool preserves_scalar_order_p
3514         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3515
3516       /* Skip the pair if inter-iteration dependencies are irrelevant
3517          and intra-iteration dependencies are guaranteed to be honored.  */
3518       if (ignore_step_p
3519           && (preserves_scalar_order_p
3520               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3521                                                  &lower_bound)))
3522         {
3523           if (dump_enabled_p ())
3524             dump_printf_loc (MSG_NOTE, vect_location,
3525                              "no need for alias check between "
3526                              "%T and %T when VF is 1\n",
3527                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3528           continue;
3529         }
3530
3531       /* See whether we can handle the alias using a bounds check on
3532          the step, and whether that's likely to be the best approach.
3533          (It might not be, for example, if the minimum step is much larger
3534          than the number of bytes handled by one vector iteration.)  */
3535       if (!ignore_step_p
3536           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3537           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3538                                              &lower_bound)
3539           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3540               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3541         {
3542           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3543           if (dump_enabled_p ())
3544             {
3545               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3546                                "%T and %T when the step %T is outside ",
3547                                DR_REF (dr_info_a->dr),
3548                                DR_REF (dr_info_b->dr),
3549                                DR_STEP (dr_info_a->dr));
3550               if (unsigned_p)
3551                 dump_printf (MSG_NOTE, "[0");
3552               else
3553                 {
3554                   dump_printf (MSG_NOTE, "(");
3555                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3556                 }
3557               dump_printf (MSG_NOTE, ", ");
3558               dump_dec (MSG_NOTE, lower_bound);
3559               dump_printf (MSG_NOTE, ")\n");
3560             }
3561           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3562                                   unsigned_p, lower_bound);
3563           continue;
3564         }
3565
3566       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3567       if (dr_group_first_a)
3568         {
3569           stmt_info_a = dr_group_first_a;
3570           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3571         }
3572
3573       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3574       if (dr_group_first_b)
3575         {
3576           stmt_info_b = dr_group_first_b;
3577           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3578         }
3579
3580       if (ignore_step_p)
3581         {
3582           segment_length_a = size_zero_node;
3583           segment_length_b = size_zero_node;
3584         }
3585       else
3586         {
3587           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3588                                 DR_STEP (dr_info_b->dr), 0))
3589             length_factor = scalar_loop_iters;
3590           else
3591             length_factor = size_int (vect_factor);
3592           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3593           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3594         }
3595       access_size_a = vect_vfa_access_size (dr_info_a);
3596       access_size_b = vect_vfa_access_size (dr_info_b);
3597       align_a = vect_vfa_align (dr_info_a);
3598       align_b = vect_vfa_align (dr_info_b);
3599
3600       /* See whether the alias is known at compilation time.  */
3601       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3602                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
3603           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3604                               DR_OFFSET (dr_info_b->dr), 0)
3605           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3606           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3607           && poly_int_tree_p (segment_length_a)
3608           && poly_int_tree_p (segment_length_b))
3609         {
3610           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3611                                              segment_length_a,
3612                                              segment_length_b,
3613                                              access_size_a,
3614                                              access_size_b);
3615           if (res >= 0 && dump_enabled_p ())
3616             {
3617               dump_printf_loc (MSG_NOTE, vect_location,
3618                                "can tell at compile time that %T and %T",
3619                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3620               if (res == 0)
3621                 dump_printf (MSG_NOTE, " do not alias\n");
3622               else
3623                 dump_printf (MSG_NOTE, " alias\n");
3624             }
3625
3626           if (res == 0)
3627             continue;
3628
3629           if (res == 1)
3630             return opt_result::failure_at (stmt_info_b->stmt,
3631                                            "not vectorized:"
3632                                            " compilation time alias: %G%G",
3633                                            stmt_info_a->stmt,
3634                                            stmt_info_b->stmt);
3635         }
3636
3637       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3638                             access_size_a, align_a);
3639       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3640                             access_size_b, align_b);
3641       /* Canonicalize the order to be the one that's needed for accurate
3642          RAW, WAR and WAW flags, in cases where the data references are
3643          well-ordered.  The order doesn't really matter otherwise,
3644          but we might as well be consistent.  */
3645       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3646         std::swap (dr_a, dr_b);
3647
3648       dr_with_seg_len_pair_t dr_with_seg_len_pair
3649         (dr_a, dr_b, (preserves_scalar_order_p
3650                       ? dr_with_seg_len_pair_t::WELL_ORDERED
3651                       : dr_with_seg_len_pair_t::REORDERED));
3652
3653       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3654     }
3655
3656   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3657
3658   unsigned int count = (comp_alias_ddrs.length ()
3659                         + check_unequal_addrs.length ());
3660
3661   if (dump_enabled_p ())
3662     dump_printf_loc (MSG_NOTE, vect_location,
3663                      "improved number of alias checks from %d to %d\n",
3664                      may_alias_ddrs.length (), count);
3665   unsigned limit = param_vect_max_version_for_alias_checks;
3666   if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3667     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3668   if (count > limit)
3669     return opt_result::failure_at
3670       (vect_location,
3671        "number of versioning for alias run-time tests exceeds %d "
3672        "(--param vect-max-version-for-alias-checks)\n", limit);
3673
3674   return opt_result::success ();
3675 }
3676
3677 /* Check whether we can use an internal function for a gather load
3678    or scatter store.  READ_P is true for loads and false for stores.
3679    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3680    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3681    is the type of the offset that is being applied to the invariant
3682    base address.  SCALE is the amount by which the offset should
3683    be multiplied *after* it has been converted to address width.
3684
3685    Return true if the function is supported, storing the function id in
3686    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
3687
3688 bool
3689 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3690                           tree vectype, tree memory_type, tree offset_type,
3691                           int scale, internal_fn *ifn_out,
3692                           tree *offset_vectype_out)
3693 {
3694   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3695   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3696   if (element_bits != memory_bits)
3697     /* For now the vector elements must be the same width as the
3698        memory elements.  */
3699     return false;
3700
3701   /* Work out which function we need.  */
3702   internal_fn ifn;
3703   if (read_p)
3704     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3705   else
3706     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3707
3708   for (;;)
3709     {
3710       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3711       if (!offset_vectype)
3712         return false;
3713
3714       /* Test whether the target supports this combination.  */
3715       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3716                                                   offset_vectype, scale))
3717         {
3718           *ifn_out = ifn;
3719           *offset_vectype_out = offset_vectype;
3720           return true;
3721         }
3722
3723       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3724           && TYPE_PRECISION (offset_type) >= element_bits)
3725         return false;
3726
3727       offset_type = build_nonstandard_integer_type
3728         (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3729     }
3730 }
3731
3732 /* STMT_INFO is a call to an internal gather load or scatter store function.
3733    Describe the operation in INFO.  */
3734
3735 static void
3736 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3737                                    gather_scatter_info *info)
3738 {
3739   gcall *call = as_a <gcall *> (stmt_info->stmt);
3740   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3741   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3742
3743   info->ifn = gimple_call_internal_fn (call);
3744   info->decl = NULL_TREE;
3745   info->base = gimple_call_arg (call, 0);
3746   info->offset = gimple_call_arg (call, 1);
3747   info->offset_dt = vect_unknown_def_type;
3748   info->offset_vectype = NULL_TREE;
3749   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3750   info->element_type = TREE_TYPE (vectype);
3751   info->memory_type = TREE_TYPE (DR_REF (dr));
3752 }
3753
3754 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3755    gather load or scatter store.  Describe the operation in *INFO if so.  */
3756
3757 bool
3758 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3759                            gather_scatter_info *info)
3760 {
3761   HOST_WIDE_INT scale = 1;
3762   poly_int64 pbitpos, pbitsize;
3763   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3764   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3765   tree offtype = NULL_TREE;
3766   tree decl = NULL_TREE, base, off;
3767   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3768   tree memory_type = TREE_TYPE (DR_REF (dr));
3769   machine_mode pmode;
3770   int punsignedp, reversep, pvolatilep = 0;
3771   internal_fn ifn;
3772   tree offset_vectype;
3773   bool masked_p = false;
3774
3775   /* See whether this is already a call to a gather/scatter internal function.
3776      If not, see whether it's a masked load or store.  */
3777   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3778   if (call && gimple_call_internal_p (call))
3779     {
3780       ifn = gimple_call_internal_fn (call);
3781       if (internal_gather_scatter_fn_p (ifn))
3782         {
3783           vect_describe_gather_scatter_call (stmt_info, info);
3784           return true;
3785         }
3786       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3787     }
3788
3789   /* True if we should aim to use internal functions rather than
3790      built-in functions.  */
3791   bool use_ifn_p = (DR_IS_READ (dr)
3792                     ? supports_vec_gather_load_p ()
3793                     : supports_vec_scatter_store_p ());
3794
3795   base = DR_REF (dr);
3796   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3797      see if we can use the def stmt of the address.  */
3798   if (masked_p
3799       && TREE_CODE (base) == MEM_REF
3800       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3801       && integer_zerop (TREE_OPERAND (base, 1))
3802       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3803     {
3804       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3805       if (is_gimple_assign (def_stmt)
3806           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3807         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3808     }
3809
3810   /* The gather and scatter builtins need address of the form
3811      loop_invariant + vector * {1, 2, 4, 8}
3812      or
3813      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3814      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3815      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3816      multiplications and additions in it.  To get a vector, we need
3817      a single SSA_NAME that will be defined in the loop and will
3818      contain everything that is not loop invariant and that can be
3819      vectorized.  The following code attempts to find such a preexistng
3820      SSA_NAME OFF and put the loop invariants into a tree BASE
3821      that can be gimplified before the loop.  */
3822   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3823                               &punsignedp, &reversep, &pvolatilep);
3824   if (reversep)
3825     return false;
3826
3827   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3828
3829   if (TREE_CODE (base) == MEM_REF)
3830     {
3831       if (!integer_zerop (TREE_OPERAND (base, 1)))
3832         {
3833           if (off == NULL_TREE)
3834             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3835           else
3836             off = size_binop (PLUS_EXPR, off,
3837                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3838         }
3839       base = TREE_OPERAND (base, 0);
3840     }
3841   else
3842     base = build_fold_addr_expr (base);
3843
3844   if (off == NULL_TREE)
3845     off = size_zero_node;
3846
3847   /* If base is not loop invariant, either off is 0, then we start with just
3848      the constant offset in the loop invariant BASE and continue with base
3849      as OFF, otherwise give up.
3850      We could handle that case by gimplifying the addition of base + off
3851      into some SSA_NAME and use that as off, but for now punt.  */
3852   if (!expr_invariant_in_loop_p (loop, base))
3853     {
3854       if (!integer_zerop (off))
3855         return false;
3856       off = base;
3857       base = size_int (pbytepos);
3858     }
3859   /* Otherwise put base + constant offset into the loop invariant BASE
3860      and continue with OFF.  */
3861   else
3862     {
3863       base = fold_convert (sizetype, base);
3864       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3865     }
3866
3867   /* OFF at this point may be either a SSA_NAME or some tree expression
3868      from get_inner_reference.  Try to peel off loop invariants from it
3869      into BASE as long as possible.  */
3870   STRIP_NOPS (off);
3871   while (offtype == NULL_TREE)
3872     {
3873       enum tree_code code;
3874       tree op0, op1, add = NULL_TREE;
3875
3876       if (TREE_CODE (off) == SSA_NAME)
3877         {
3878           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3879
3880           if (expr_invariant_in_loop_p (loop, off))
3881             return false;
3882
3883           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3884             break;
3885
3886           op0 = gimple_assign_rhs1 (def_stmt);
3887           code = gimple_assign_rhs_code (def_stmt);
3888           op1 = gimple_assign_rhs2 (def_stmt);
3889         }
3890       else
3891         {
3892           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3893             return false;
3894           code = TREE_CODE (off);
3895           extract_ops_from_tree (off, &code, &op0, &op1);
3896         }
3897       switch (code)
3898         {
3899         case POINTER_PLUS_EXPR:
3900         case PLUS_EXPR:
3901           if (expr_invariant_in_loop_p (loop, op0))
3902             {
3903               add = op0;
3904               off = op1;
3905             do_add:
3906               add = fold_convert (sizetype, add);
3907               if (scale != 1)
3908                 add = size_binop (MULT_EXPR, add, size_int (scale));
3909               base = size_binop (PLUS_EXPR, base, add);
3910               continue;
3911             }
3912           if (expr_invariant_in_loop_p (loop, op1))
3913             {
3914               add = op1;
3915               off = op0;
3916               goto do_add;
3917             }
3918           break;
3919         case MINUS_EXPR:
3920           if (expr_invariant_in_loop_p (loop, op1))
3921             {
3922               add = fold_convert (sizetype, op1);
3923               add = size_binop (MINUS_EXPR, size_zero_node, add);
3924               off = op0;
3925               goto do_add;
3926             }
3927           break;
3928         case MULT_EXPR:
3929           if (scale == 1 && tree_fits_shwi_p (op1))
3930             {
3931               int new_scale = tree_to_shwi (op1);
3932               /* Only treat this as a scaling operation if the target
3933                  supports it for at least some offset type.  */
3934               if (use_ifn_p
3935                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3936                                                 masked_p, vectype, memory_type,
3937                                                 signed_char_type_node,
3938                                                 new_scale, &ifn,
3939                                                 &offset_vectype)
3940                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3941                                                 masked_p, vectype, memory_type,
3942                                                 unsigned_char_type_node,
3943                                                 new_scale, &ifn,
3944                                                 &offset_vectype))
3945                 break;
3946               scale = new_scale;
3947               off = op0;
3948               continue;
3949             }
3950           break;
3951         case SSA_NAME:
3952           off = op0;
3953           continue;
3954         CASE_CONVERT:
3955           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3956               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3957             break;
3958
3959           /* Don't include the conversion if the target is happy with
3960              the current offset type.  */
3961           if (use_ifn_p
3962               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3963                                            masked_p, vectype, memory_type,
3964                                            TREE_TYPE (off), scale, &ifn,
3965                                            &offset_vectype))
3966             break;
3967
3968           if (TYPE_PRECISION (TREE_TYPE (op0))
3969               == TYPE_PRECISION (TREE_TYPE (off)))
3970             {
3971               off = op0;
3972               continue;
3973             }
3974
3975           if (TYPE_PRECISION (TREE_TYPE (op0))
3976               < TYPE_PRECISION (TREE_TYPE (off)))
3977             {
3978               off = op0;
3979               offtype = TREE_TYPE (off);
3980               STRIP_NOPS (off);
3981               continue;
3982             }
3983           break;
3984         default:
3985           break;
3986         }
3987       break;
3988     }
3989
3990   /* If at the end OFF still isn't a SSA_NAME or isn't
3991      defined in the loop, punt.  */
3992   if (TREE_CODE (off) != SSA_NAME
3993       || expr_invariant_in_loop_p (loop, off))
3994     return false;
3995
3996   if (offtype == NULL_TREE)
3997     offtype = TREE_TYPE (off);
3998
3999   if (use_ifn_p)
4000     {
4001       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4002                                      vectype, memory_type, offtype, scale,
4003                                      &ifn, &offset_vectype))
4004         return false;
4005     }
4006   else
4007     {
4008       if (DR_IS_READ (dr))
4009         {
4010           if (targetm.vectorize.builtin_gather)
4011             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4012         }
4013       else
4014         {
4015           if (targetm.vectorize.builtin_scatter)
4016             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4017         }
4018
4019       if (!decl)
4020         return false;
4021
4022       ifn = IFN_LAST;
4023       /* The offset vector type will be read from DECL when needed.  */
4024       offset_vectype = NULL_TREE;
4025     }
4026
4027   info->ifn = ifn;
4028   info->decl = decl;
4029   info->base = base;
4030   info->offset = off;
4031   info->offset_dt = vect_unknown_def_type;
4032   info->offset_vectype = offset_vectype;
4033   info->scale = scale;
4034   info->element_type = TREE_TYPE (vectype);
4035   info->memory_type = memory_type;
4036   return true;
4037 }
4038
4039 /* Find the data references in STMT, analyze them with respect to LOOP and
4040    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4041    be handled.  */
4042
4043 opt_result
4044 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4045                                vec<data_reference_p> *datarefs)
4046 {
4047   /* We can ignore clobbers for dataref analysis - they are removed during
4048      loop vectorization and BB vectorization checks dependences with a
4049      stmt walk.  */
4050   if (gimple_clobber_p (stmt))
4051     return opt_result::success ();
4052
4053   if (gimple_has_volatile_ops (stmt))
4054     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4055                                    stmt);
4056
4057   if (stmt_can_throw_internal (cfun, stmt))
4058     return opt_result::failure_at (stmt,
4059                                    "not vectorized:"
4060                                    " statement can throw an exception: %G",
4061                                    stmt);
4062
4063   auto_vec<data_reference_p, 2> refs;
4064   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4065   if (!res)
4066     return res;
4067
4068   if (refs.is_empty ())
4069     return opt_result::success ();
4070
4071   if (refs.length () > 1)
4072     return opt_result::failure_at (stmt,
4073                                    "not vectorized:"
4074                                    " more than one data ref in stmt: %G", stmt);
4075
4076   if (gcall *call = dyn_cast <gcall *> (stmt))
4077     if (!gimple_call_internal_p (call)
4078         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4079             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4080       return opt_result::failure_at (stmt,
4081                                      "not vectorized: dr in a call %G", stmt);
4082
4083   data_reference_p dr = refs.pop ();
4084   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4085       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4086     return opt_result::failure_at (stmt,
4087                                    "not vectorized:"
4088                                    " statement is bitfield access %G", stmt);
4089
4090   if (DR_BASE_ADDRESS (dr)
4091       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4092     return opt_result::failure_at (stmt,
4093                                    "not vectorized:"
4094                                    " base addr of dr is a constant\n");
4095
4096   /* Check whether this may be a SIMD lane access and adjust the
4097      DR to make it easier for us to handle it.  */
4098   if (loop
4099       && loop->simduid
4100       && (!DR_BASE_ADDRESS (dr)
4101           || !DR_OFFSET (dr)
4102           || !DR_INIT (dr)
4103           || !DR_STEP (dr)))
4104     {
4105       struct data_reference *newdr
4106         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4107                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4108       if (DR_BASE_ADDRESS (newdr)
4109           && DR_OFFSET (newdr)
4110           && DR_INIT (newdr)
4111           && DR_STEP (newdr)
4112           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4113           && integer_zerop (DR_STEP (newdr)))
4114         {
4115           tree base_address = DR_BASE_ADDRESS (newdr);
4116           tree off = DR_OFFSET (newdr);
4117           tree step = ssize_int (1);
4118           if (integer_zerop (off)
4119               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4120             {
4121               off = TREE_OPERAND (base_address, 1);
4122               base_address = TREE_OPERAND (base_address, 0);
4123             }
4124           STRIP_NOPS (off);
4125           if (TREE_CODE (off) == MULT_EXPR
4126               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4127             {
4128               step = TREE_OPERAND (off, 1);
4129               off = TREE_OPERAND (off, 0);
4130               STRIP_NOPS (off);
4131             }
4132           if (CONVERT_EXPR_P (off)
4133               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4134                   < TYPE_PRECISION (TREE_TYPE (off))))
4135             off = TREE_OPERAND (off, 0);
4136           if (TREE_CODE (off) == SSA_NAME)
4137             {
4138               gimple *def = SSA_NAME_DEF_STMT (off);
4139               /* Look through widening conversion.  */
4140               if (is_gimple_assign (def)
4141                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4142                 {
4143                   tree rhs1 = gimple_assign_rhs1 (def);
4144                   if (TREE_CODE (rhs1) == SSA_NAME
4145                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4146                       && (TYPE_PRECISION (TREE_TYPE (off))
4147                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4148                     def = SSA_NAME_DEF_STMT (rhs1);
4149                 }
4150               if (is_gimple_call (def)
4151                   && gimple_call_internal_p (def)
4152                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4153                 {
4154                   tree arg = gimple_call_arg (def, 0);
4155                   tree reft = TREE_TYPE (DR_REF (newdr));
4156                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4157                   arg = SSA_NAME_VAR (arg);
4158                   if (arg == loop->simduid
4159                       /* For now.  */
4160                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4161                     {
4162                       DR_BASE_ADDRESS (newdr) = base_address;
4163                       DR_OFFSET (newdr) = ssize_int (0);
4164                       DR_STEP (newdr) = step;
4165                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4166                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4167                       /* Mark as simd-lane access.  */
4168                       tree arg2 = gimple_call_arg (def, 1);
4169                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4170                       free_data_ref (dr);
4171                       datarefs->safe_push (newdr);
4172                       return opt_result::success ();
4173                     }
4174                 }
4175             }
4176         }
4177       free_data_ref (newdr);
4178     }
4179
4180   datarefs->safe_push (dr);
4181   return opt_result::success ();
4182 }
4183
4184 /* Function vect_analyze_data_refs.
4185
4186   Find all the data references in the loop or basic block.
4187
4188    The general structure of the analysis of data refs in the vectorizer is as
4189    follows:
4190    1- vect_analyze_data_refs(loop/bb): call
4191       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4192       in the loop/bb and their dependences.
4193    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4194    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4195    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4196
4197 */
4198
4199 opt_result
4200 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4201 {
4202   class loop *loop = NULL;
4203   unsigned int i;
4204   struct data_reference *dr;
4205   tree scalar_type;
4206
4207   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4208
4209   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4210     loop = LOOP_VINFO_LOOP (loop_vinfo);
4211
4212   /* Go through the data-refs, check that the analysis succeeded.  Update
4213      pointer from stmt_vec_info struct to DR and vectype.  */
4214
4215   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4216   FOR_EACH_VEC_ELT (datarefs, i, dr)
4217     {
4218       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4219       poly_uint64 vf;
4220
4221       gcc_assert (DR_REF (dr));
4222       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4223       gcc_assert (!stmt_info->dr_aux.dr);
4224       stmt_info->dr_aux.dr = dr;
4225       stmt_info->dr_aux.stmt = stmt_info;
4226
4227       /* Check that analysis of the data-ref succeeded.  */
4228       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4229           || !DR_STEP (dr))
4230         {
4231           bool maybe_gather
4232             = DR_IS_READ (dr)
4233               && !TREE_THIS_VOLATILE (DR_REF (dr))
4234               && (targetm.vectorize.builtin_gather != NULL
4235                   || supports_vec_gather_load_p ());
4236           bool maybe_scatter
4237             = DR_IS_WRITE (dr)
4238               && !TREE_THIS_VOLATILE (DR_REF (dr))
4239               && (targetm.vectorize.builtin_scatter != NULL
4240                   || supports_vec_scatter_store_p ());
4241
4242           /* If target supports vector gather loads or scatter stores,
4243              see if they can't be used.  */
4244           if (is_a <loop_vec_info> (vinfo)
4245               && !nested_in_vect_loop_p (loop, stmt_info))
4246             {
4247               if (maybe_gather || maybe_scatter)
4248                 {
4249                   if (maybe_gather)
4250                     gatherscatter = GATHER;
4251                   else
4252                     gatherscatter = SCATTER;
4253                 }
4254             }
4255
4256           if (gatherscatter == SG_NONE)
4257             {
4258               if (dump_enabled_p ())
4259                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4260                                  "not vectorized: data ref analysis "
4261                                  "failed %G", stmt_info->stmt);
4262               if (is_a <bb_vec_info> (vinfo))
4263                 {
4264                   /* In BB vectorization the ref can still participate
4265                      in dependence analysis, we just can't vectorize it.  */
4266                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4267                   continue;
4268                 }
4269               return opt_result::failure_at (stmt_info->stmt,
4270                                              "not vectorized:"
4271                                              " data ref analysis failed: %G",
4272                                              stmt_info->stmt);
4273             }
4274         }
4275
4276       /* See if this was detected as SIMD lane access.  */
4277       if (dr->aux == (void *)-1
4278           || dr->aux == (void *)-2
4279           || dr->aux == (void *)-3
4280           || dr->aux == (void *)-4)
4281         {
4282           if (nested_in_vect_loop_p (loop, stmt_info))
4283             return opt_result::failure_at (stmt_info->stmt,
4284                                            "not vectorized:"
4285                                            " data ref analysis failed: %G",
4286                                            stmt_info->stmt);
4287           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4288             = -(uintptr_t) dr->aux;
4289         }
4290
4291       tree base = get_base_address (DR_REF (dr));
4292       if (base && VAR_P (base) && DECL_NONALIASED (base))
4293         {
4294           if (dump_enabled_p ())
4295             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4296                              "not vectorized: base object not addressable "
4297                              "for stmt: %G", stmt_info->stmt);
4298           if (is_a <bb_vec_info> (vinfo))
4299             {
4300               /* In BB vectorization the ref can still participate
4301                  in dependence analysis, we just can't vectorize it.  */
4302               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4303               continue;
4304             }
4305           return opt_result::failure_at (stmt_info->stmt,
4306                                          "not vectorized: base object not"
4307                                          " addressable for stmt: %G",
4308                                          stmt_info->stmt);
4309         }
4310
4311       if (is_a <loop_vec_info> (vinfo)
4312           && DR_STEP (dr)
4313           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4314         {
4315           if (nested_in_vect_loop_p (loop, stmt_info))
4316             return opt_result::failure_at (stmt_info->stmt,
4317                                            "not vectorized: "
4318                                            "not suitable for strided load %G",
4319                                            stmt_info->stmt);
4320           STMT_VINFO_STRIDED_P (stmt_info) = true;
4321         }
4322
4323       /* Update DR field in stmt_vec_info struct.  */
4324
4325       /* If the dataref is in an inner-loop of the loop that is considered for
4326          for vectorization, we also want to analyze the access relative to
4327          the outer-loop (DR contains information only relative to the
4328          inner-most enclosing loop).  We do that by building a reference to the
4329          first location accessed by the inner-loop, and analyze it relative to
4330          the outer-loop.  */
4331       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4332         {
4333           /* Build a reference to the first location accessed by the
4334              inner loop: *(BASE + INIT + OFFSET).  By construction,
4335              this address must be invariant in the inner loop, so we
4336              can consider it as being used in the outer loop.  */
4337           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4338           tree offset = unshare_expr (DR_OFFSET (dr));
4339           tree init = unshare_expr (DR_INIT (dr));
4340           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4341                                           init, offset);
4342           tree init_addr = fold_build_pointer_plus (base, init_offset);
4343           tree init_ref = build_fold_indirect_ref (init_addr);
4344
4345           if (dump_enabled_p ())
4346             dump_printf_loc (MSG_NOTE, vect_location,
4347                              "analyze in outer loop: %T\n", init_ref);
4348
4349           opt_result res
4350             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4351                                     init_ref, loop, stmt_info->stmt);
4352           if (!res)
4353             /* dr_analyze_innermost already explained the failure.  */
4354             return res;
4355
4356           if (dump_enabled_p ())
4357             dump_printf_loc (MSG_NOTE, vect_location,
4358                              "\touter base_address: %T\n"
4359                              "\touter offset from base address: %T\n"
4360                              "\touter constant offset from base address: %T\n"
4361                              "\touter step: %T\n"
4362                              "\touter base alignment: %d\n\n"
4363                              "\touter base misalignment: %d\n"
4364                              "\touter offset alignment: %d\n"
4365                              "\touter step alignment: %d\n",
4366                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4367                              STMT_VINFO_DR_OFFSET (stmt_info),
4368                              STMT_VINFO_DR_INIT (stmt_info),
4369                              STMT_VINFO_DR_STEP (stmt_info),
4370                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4371                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4372                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4373                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4374         }
4375
4376       /* Set vectype for STMT.  */
4377       scalar_type = TREE_TYPE (DR_REF (dr));
4378       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4379       if (!vectype)
4380         {
4381           if (dump_enabled_p ())
4382             {
4383               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4384                                "not vectorized: no vectype for stmt: %G",
4385                                stmt_info->stmt);
4386               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4387               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4388                                  scalar_type);
4389               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4390             }
4391
4392           if (is_a <bb_vec_info> (vinfo))
4393             {
4394               /* No vector type is fine, the ref can still participate
4395                  in dependence analysis, we just can't vectorize it.  */
4396               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4397               continue;
4398             }
4399           if (fatal)
4400             *fatal = false;
4401           return opt_result::failure_at (stmt_info->stmt,
4402                                          "not vectorized:"
4403                                          " no vectype for stmt: %G"
4404                                          " scalar_type: %T\n",
4405                                          stmt_info->stmt, scalar_type);
4406         }
4407       else
4408         {
4409           if (dump_enabled_p ())
4410             dump_printf_loc (MSG_NOTE, vect_location,
4411                              "got vectype for stmt: %G%T\n",
4412                              stmt_info->stmt, vectype);
4413         }
4414
4415       /* Adjust the minimal vectorization factor according to the
4416          vector type.  */
4417       vf = TYPE_VECTOR_SUBPARTS (vectype);
4418       *min_vf = upper_bound (*min_vf, vf);
4419
4420       /* Leave the BB vectorizer to pick the vector type later, based on
4421          the final dataref group size and SLP node size.  */
4422       if (is_a <loop_vec_info> (vinfo))
4423         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4424
4425       if (gatherscatter != SG_NONE)
4426         {
4427           gather_scatter_info gs_info;
4428           if (!vect_check_gather_scatter (stmt_info,
4429                                           as_a <loop_vec_info> (vinfo),
4430                                           &gs_info)
4431               || !get_vectype_for_scalar_type (vinfo,
4432                                                TREE_TYPE (gs_info.offset)))
4433             {
4434               if (fatal)
4435                 *fatal = false;
4436               return opt_result::failure_at
4437                         (stmt_info->stmt,
4438                          (gatherscatter == GATHER)
4439                          ? "not vectorized: not suitable for gather load %G"
4440                          : "not vectorized: not suitable for scatter store %G",
4441                          stmt_info->stmt);
4442             }
4443           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4444         }
4445     }
4446
4447   /* We used to stop processing and prune the list here.  Verify we no
4448      longer need to.  */
4449   gcc_assert (i == datarefs.length ());
4450
4451   return opt_result::success ();
4452 }
4453
4454
4455 /* Function vect_get_new_vect_var.
4456
4457    Returns a name for a new variable.  The current naming scheme appends the
4458    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4459    the name of vectorizer generated variables, and appends that to NAME if
4460    provided.  */
4461
4462 tree
4463 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4464 {
4465   const char *prefix;
4466   tree new_vect_var;
4467
4468   switch (var_kind)
4469   {
4470   case vect_simple_var:
4471     prefix = "vect";
4472     break;
4473   case vect_scalar_var:
4474     prefix = "stmp";
4475     break;
4476   case vect_mask_var:
4477     prefix = "mask";
4478     break;
4479   case vect_pointer_var:
4480     prefix = "vectp";
4481     break;
4482   default:
4483     gcc_unreachable ();
4484   }
4485
4486   if (name)
4487     {
4488       char* tmp = concat (prefix, "_", name, NULL);
4489       new_vect_var = create_tmp_reg (type, tmp);
4490       free (tmp);
4491     }
4492   else
4493     new_vect_var = create_tmp_reg (type, prefix);
4494
4495   return new_vect_var;
4496 }
4497
4498 /* Like vect_get_new_vect_var but return an SSA name.  */
4499
4500 tree
4501 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4502 {
4503   const char *prefix;
4504   tree new_vect_var;
4505
4506   switch (var_kind)
4507   {
4508   case vect_simple_var:
4509     prefix = "vect";
4510     break;
4511   case vect_scalar_var:
4512     prefix = "stmp";
4513     break;
4514   case vect_pointer_var:
4515     prefix = "vectp";
4516     break;
4517   default:
4518     gcc_unreachable ();
4519   }
4520
4521   if (name)
4522     {
4523       char* tmp = concat (prefix, "_", name, NULL);
4524       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4525       free (tmp);
4526     }
4527   else
4528     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4529
4530   return new_vect_var;
4531 }
4532
4533 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO.  */
4534
4535 static void
4536 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4537 {
4538   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4539   int misalign = DR_MISALIGNMENT (dr_info);
4540   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4541     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4542   else
4543     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4544                             known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4545                             misalign);
4546 }
4547
4548 /* Function vect_create_addr_base_for_vector_ref.
4549
4550    Create an expression that computes the address of the first memory location
4551    that will be accessed for a data reference.
4552
4553    Input:
4554    STMT_INFO: The statement containing the data reference.
4555    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4556    OFFSET: Optional. If supplied, it is be added to the initial address.
4557    LOOP:    Specify relative to which loop-nest should the address be computed.
4558             For example, when the dataref is in an inner-loop nested in an
4559             outer-loop that is now being vectorized, LOOP can be either the
4560             outer-loop, or the inner-loop.  The first memory location accessed
4561             by the following dataref ('in' points to short):
4562
4563                 for (i=0; i<N; i++)
4564                    for (j=0; j<M; j++)
4565                      s += in[i+j]
4566
4567             is as follows:
4568             if LOOP=i_loop:     &in             (relative to i_loop)
4569             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4570    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4571             initial address.  Unlike OFFSET, which is number of elements to
4572             be added, BYTE_OFFSET is measured in bytes.
4573
4574    Output:
4575    1. Return an SSA_NAME whose value is the address of the memory location of
4576       the first vector of the data reference.
4577    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4578       these statement(s) which define the returned SSA_NAME.
4579
4580    FORNOW: We are only handling array accesses with step 1.  */
4581
4582 tree
4583 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4584                                       gimple_seq *new_stmt_list,
4585                                       tree offset,
4586                                       tree byte_offset)
4587 {
4588   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4589   struct data_reference *dr = dr_info->dr;
4590   const char *base_name;
4591   tree addr_base;
4592   tree dest;
4593   gimple_seq seq = NULL;
4594   tree vect_ptr_type;
4595   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4596   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4597   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4598
4599   tree data_ref_base = unshare_expr (drb->base_address);
4600   tree base_offset = unshare_expr (get_dr_vinfo_offset (dr_info, true));
4601   tree init = unshare_expr (drb->init);
4602
4603   if (loop_vinfo)
4604     base_name = get_name (data_ref_base);
4605   else
4606     {
4607       base_offset = ssize_int (0);
4608       init = ssize_int (0);
4609       base_name = get_name (DR_REF (dr));
4610     }
4611
4612   /* Create base_offset */
4613   base_offset = size_binop (PLUS_EXPR,
4614                             fold_convert (sizetype, base_offset),
4615                             fold_convert (sizetype, init));
4616
4617   if (offset)
4618     {
4619       offset = fold_build2 (MULT_EXPR, sizetype,
4620                             fold_convert (sizetype, offset), step);
4621       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4622                                  base_offset, offset);
4623     }
4624   if (byte_offset)
4625     {
4626       byte_offset = fold_convert (sizetype, byte_offset);
4627       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4628                                  base_offset, byte_offset);
4629     }
4630
4631   /* base + base_offset */
4632   if (loop_vinfo)
4633     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4634   else
4635     {
4636       addr_base = build1 (ADDR_EXPR,
4637                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4638                           unshare_expr (DR_REF (dr)));
4639     }
4640
4641   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4642   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4643   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4644   gimple_seq_add_seq (new_stmt_list, seq);
4645
4646   if (DR_PTR_INFO (dr)
4647       && TREE_CODE (addr_base) == SSA_NAME
4648       && !SSA_NAME_PTR_INFO (addr_base))
4649     {
4650       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4651       if (offset || byte_offset)
4652         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4653     }
4654
4655   if (dump_enabled_p ())
4656     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4657
4658   return addr_base;
4659 }
4660
4661
4662 /* Function vect_create_data_ref_ptr.
4663
4664    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4665    location accessed in the loop by STMT_INFO, along with the def-use update
4666    chain to appropriately advance the pointer through the loop iterations.
4667    Also set aliasing information for the pointer.  This pointer is used by
4668    the callers to this function to create a memory reference expression for
4669    vector load/store access.
4670
4671    Input:
4672    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4673          GIMPLE_ASSIGN <name, data-ref> or
4674          GIMPLE_ASSIGN <data-ref, name>.
4675    2. AGGR_TYPE: the type of the reference, which should be either a vector
4676         or an array.
4677    3. AT_LOOP: the loop where the vector memref is to be created.
4678    4. OFFSET (optional): an offset to be added to the initial address accessed
4679         by the data-ref in STMT_INFO.
4680    5. BSI: location where the new stmts are to be placed if there is no loop
4681    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4682         pointing to the initial address.
4683    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4684         to the initial address accessed by the data-ref in STMT_INFO.  This is
4685         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4686         in bytes.
4687    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4688         to the IV during each iteration of the loop.  NULL says to move
4689         by one copy of AGGR_TYPE up or down, depending on the step of the
4690         data reference.
4691
4692    Output:
4693    1. Declare a new ptr to vector_type, and have it point to the base of the
4694       data reference (initial addressed accessed by the data reference).
4695       For example, for vector of type V8HI, the following code is generated:
4696
4697       v8hi *ap;
4698       ap = (v8hi *)initial_address;
4699
4700       if OFFSET is not supplied:
4701          initial_address = &a[init];
4702       if OFFSET is supplied:
4703          initial_address = &a[init + OFFSET];
4704       if BYTE_OFFSET is supplied:
4705          initial_address = &a[init] + BYTE_OFFSET;
4706
4707       Return the initial_address in INITIAL_ADDRESS.
4708
4709    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4710       update the pointer in each iteration of the loop.
4711
4712       Return the increment stmt that updates the pointer in PTR_INCR.
4713
4714    3. Return the pointer.  */
4715
4716 tree
4717 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4718                           class loop *at_loop, tree offset,
4719                           tree *initial_address, gimple_stmt_iterator *gsi,
4720                           gimple **ptr_incr, bool only_init,
4721                           tree byte_offset, tree iv_step)
4722 {
4723   const char *base_name;
4724   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4725   class loop *loop = NULL;
4726   bool nested_in_vect_loop = false;
4727   class loop *containing_loop = NULL;
4728   tree aggr_ptr_type;
4729   tree aggr_ptr;
4730   tree new_temp;
4731   gimple_seq new_stmt_list = NULL;
4732   edge pe = NULL;
4733   basic_block new_bb;
4734   tree aggr_ptr_init;
4735   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4736   struct data_reference *dr = dr_info->dr;
4737   tree aptr;
4738   gimple_stmt_iterator incr_gsi;
4739   bool insert_after;
4740   tree indx_before_incr, indx_after_incr;
4741   gimple *incr;
4742   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4743
4744   gcc_assert (iv_step != NULL_TREE
4745               || TREE_CODE (aggr_type) == ARRAY_TYPE
4746               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4747
4748   if (loop_vinfo)
4749     {
4750       loop = LOOP_VINFO_LOOP (loop_vinfo);
4751       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4752       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4753       pe = loop_preheader_edge (loop);
4754     }
4755   else
4756     {
4757       gcc_assert (bb_vinfo);
4758       only_init = true;
4759       *ptr_incr = NULL;
4760     }
4761
4762   /* Create an expression for the first address accessed by this load
4763      in LOOP.  */
4764   base_name = get_name (DR_BASE_ADDRESS (dr));
4765
4766   if (dump_enabled_p ())
4767     {
4768       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4769       dump_printf_loc (MSG_NOTE, vect_location,
4770                        "create %s-pointer variable to type: %T",
4771                        get_tree_code_name (TREE_CODE (aggr_type)),
4772                        aggr_type);
4773       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4774         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4775       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4776         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4777       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4778         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4779       else
4780         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4781       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4782     }
4783
4784   /* (1) Create the new aggregate-pointer variable.
4785      Vector and array types inherit the alias set of their component
4786      type by default so we need to use a ref-all pointer if the data
4787      reference does not conflict with the created aggregated data
4788      reference because it is not addressable.  */
4789   bool need_ref_all = false;
4790   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4791                               get_alias_set (DR_REF (dr))))
4792     need_ref_all = true;
4793   /* Likewise for any of the data references in the stmt group.  */
4794   else if (DR_GROUP_SIZE (stmt_info) > 1)
4795     {
4796       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4797       do
4798         {
4799           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4800           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4801                                       get_alias_set (DR_REF (sdr))))
4802             {
4803               need_ref_all = true;
4804               break;
4805             }
4806           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4807         }
4808       while (sinfo);
4809     }
4810   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4811                                                need_ref_all);
4812   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4813
4814
4815   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4816      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4817      def-use update cycles for the pointer: one relative to the outer-loop
4818      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4819      to the inner-loop (which is the inner-most loop containing the dataref),
4820      and this is done be step (5) below.
4821
4822      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4823      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4824      redundant.  Steps (3),(4) create the following:
4825
4826         vp0 = &base_addr;
4827         LOOP:   vp1 = phi(vp0,vp2)
4828                 ...
4829                 ...
4830                 vp2 = vp1 + step
4831                 goto LOOP
4832
4833      If there is an inner-loop nested in loop, then step (5) will also be
4834      applied, and an additional update in the inner-loop will be created:
4835
4836         vp0 = &base_addr;
4837         LOOP:   vp1 = phi(vp0,vp2)
4838                 ...
4839         inner:     vp3 = phi(vp1,vp4)
4840                    vp4 = vp3 + inner_step
4841                    if () goto inner
4842                 ...
4843                 vp2 = vp1 + step
4844                 if () goto LOOP   */
4845
4846   /* (2) Calculate the initial address of the aggregate-pointer, and set
4847      the aggregate-pointer to point to it before the loop.  */
4848
4849   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4850
4851   new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4852                                                    offset, byte_offset);
4853   if (new_stmt_list)
4854     {
4855       if (pe)
4856         {
4857           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4858           gcc_assert (!new_bb);
4859         }
4860       else
4861         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4862     }
4863
4864   *initial_address = new_temp;
4865   aggr_ptr_init = new_temp;
4866
4867   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4868      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4869      inner-loop nested in LOOP (during outer-loop vectorization).  */
4870
4871   /* No update in loop is required.  */
4872   if (only_init && (!loop_vinfo || at_loop == loop))
4873     aptr = aggr_ptr_init;
4874   else
4875     {
4876       /* Accesses to invariant addresses should be handled specially
4877          by the caller.  */
4878       tree step = vect_dr_behavior (dr_info)->step;
4879       gcc_assert (!integer_zerop (step));
4880
4881       if (iv_step == NULL_TREE)
4882         {
4883           /* The step of the aggregate pointer is the type size,
4884              negated for downward accesses.  */
4885           iv_step = TYPE_SIZE_UNIT (aggr_type);
4886           if (tree_int_cst_sgn (step) == -1)
4887             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4888         }
4889
4890       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4891
4892       create_iv (aggr_ptr_init,
4893                  fold_convert (aggr_ptr_type, iv_step),
4894                  aggr_ptr, loop, &incr_gsi, insert_after,
4895                  &indx_before_incr, &indx_after_incr);
4896       incr = gsi_stmt (incr_gsi);
4897       loop_vinfo->add_stmt (incr);
4898
4899       /* Copy the points-to information if it exists. */
4900       if (DR_PTR_INFO (dr))
4901         {
4902           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4903           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4904         }
4905       if (ptr_incr)
4906         *ptr_incr = incr;
4907
4908       aptr = indx_before_incr;
4909     }
4910
4911   if (!nested_in_vect_loop || only_init)
4912     return aptr;
4913
4914
4915   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4916      nested in LOOP, if exists.  */
4917
4918   gcc_assert (nested_in_vect_loop);
4919   if (!only_init)
4920     {
4921       standard_iv_increment_position (containing_loop, &incr_gsi,
4922                                       &insert_after);
4923       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4924                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4925                  &indx_after_incr);
4926       incr = gsi_stmt (incr_gsi);
4927       loop_vinfo->add_stmt (incr);
4928
4929       /* Copy the points-to information if it exists. */
4930       if (DR_PTR_INFO (dr))
4931         {
4932           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4933           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4934         }
4935       if (ptr_incr)
4936         *ptr_incr = incr;
4937
4938       return indx_before_incr;
4939     }
4940   else
4941     gcc_unreachable ();
4942 }
4943
4944
4945 /* Function bump_vector_ptr
4946
4947    Increment a pointer (to a vector type) by vector-size. If requested,
4948    i.e. if PTR-INCR is given, then also connect the new increment stmt
4949    to the existing def-use update-chain of the pointer, by modifying
4950    the PTR_INCR as illustrated below:
4951
4952    The pointer def-use update-chain before this function:
4953                         DATAREF_PTR = phi (p_0, p_2)
4954                         ....
4955         PTR_INCR:       p_2 = DATAREF_PTR + step
4956
4957    The pointer def-use update-chain after this function:
4958                         DATAREF_PTR = phi (p_0, p_2)
4959                         ....
4960                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4961                         ....
4962         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4963
4964    Input:
4965    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4966                  in the loop.
4967    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4968               the loop.  The increment amount across iterations is expected
4969               to be vector_size.
4970    BSI - location where the new update stmt is to be placed.
4971    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4972    BUMP - optional. The offset by which to bump the pointer. If not given,
4973           the offset is assumed to be vector_size.
4974
4975    Output: Return NEW_DATAREF_PTR as illustrated above.
4976
4977 */
4978
4979 tree
4980 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4981                  stmt_vec_info stmt_info, tree bump)
4982 {
4983   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4984   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4985   tree update = TYPE_SIZE_UNIT (vectype);
4986   gassign *incr_stmt;
4987   ssa_op_iter iter;
4988   use_operand_p use_p;
4989   tree new_dataref_ptr;
4990
4991   if (bump)
4992     update = bump;
4993
4994   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4995     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4996   else
4997     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4998   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4999                                    dataref_ptr, update);
5000   vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
5001
5002   /* Copy the points-to information if it exists. */
5003   if (DR_PTR_INFO (dr))
5004     {
5005       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5006       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5007     }
5008
5009   if (!ptr_incr)
5010     return new_dataref_ptr;
5011
5012   /* Update the vector-pointer's cross-iteration increment.  */
5013   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5014     {
5015       tree use = USE_FROM_PTR (use_p);
5016
5017       if (use == dataref_ptr)
5018         SET_USE (use_p, new_dataref_ptr);
5019       else
5020         gcc_assert (operand_equal_p (use, update, 0));
5021     }
5022
5023   return new_dataref_ptr;
5024 }
5025
5026
5027 /* Copy memory reference info such as base/clique from the SRC reference
5028    to the DEST MEM_REF.  */
5029
5030 void
5031 vect_copy_ref_info (tree dest, tree src)
5032 {
5033   if (TREE_CODE (dest) != MEM_REF)
5034     return;
5035
5036   tree src_base = src;
5037   while (handled_component_p (src_base))
5038     src_base = TREE_OPERAND (src_base, 0);
5039   if (TREE_CODE (src_base) != MEM_REF
5040       && TREE_CODE (src_base) != TARGET_MEM_REF)
5041     return;
5042
5043   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5044   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5045 }
5046
5047
5048 /* Function vect_create_destination_var.
5049
5050    Create a new temporary of type VECTYPE.  */
5051
5052 tree
5053 vect_create_destination_var (tree scalar_dest, tree vectype)
5054 {
5055   tree vec_dest;
5056   const char *name;
5057   char *new_name;
5058   tree type;
5059   enum vect_var_kind kind;
5060
5061   kind = vectype
5062     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5063     ? vect_mask_var
5064     : vect_simple_var
5065     : vect_scalar_var;
5066   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5067
5068   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5069
5070   name = get_name (scalar_dest);
5071   if (name)
5072     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5073   else
5074     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5075   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5076   free (new_name);
5077
5078   return vec_dest;
5079 }
5080
5081 /* Function vect_grouped_store_supported.
5082
5083    Returns TRUE if interleave high and interleave low permutations
5084    are supported, and FALSE otherwise.  */
5085
5086 bool
5087 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5088 {
5089   machine_mode mode = TYPE_MODE (vectype);
5090
5091   /* vect_permute_store_chain requires the group size to be equal to 3 or
5092      be a power of two.  */
5093   if (count != 3 && exact_log2 (count) == -1)
5094     {
5095       if (dump_enabled_p ())
5096         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097                          "the size of the group of accesses"
5098                          " is not a power of 2 or not eqaul to 3\n");
5099       return false;
5100     }
5101
5102   /* Check that the permutation is supported.  */
5103   if (VECTOR_MODE_P (mode))
5104     {
5105       unsigned int i;
5106       if (count == 3)
5107         {
5108           unsigned int j0 = 0, j1 = 0, j2 = 0;
5109           unsigned int i, j;
5110
5111           unsigned int nelt;
5112           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5113             {
5114               if (dump_enabled_p ())
5115                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5116                                  "cannot handle groups of 3 stores for"
5117                                  " variable-length vectors\n");
5118               return false;
5119             }
5120
5121           vec_perm_builder sel (nelt, nelt, 1);
5122           sel.quick_grow (nelt);
5123           vec_perm_indices indices;
5124           for (j = 0; j < 3; j++)
5125             {
5126               int nelt0 = ((3 - j) * nelt) % 3;
5127               int nelt1 = ((3 - j) * nelt + 1) % 3;
5128               int nelt2 = ((3 - j) * nelt + 2) % 3;
5129               for (i = 0; i < nelt; i++)
5130                 {
5131                   if (3 * i + nelt0 < nelt)
5132                     sel[3 * i + nelt0] = j0++;
5133                   if (3 * i + nelt1 < nelt)
5134                     sel[3 * i + nelt1] = nelt + j1++;
5135                   if (3 * i + nelt2 < nelt)
5136                     sel[3 * i + nelt2] = 0;
5137                 }
5138               indices.new_vector (sel, 2, nelt);
5139               if (!can_vec_perm_const_p (mode, indices))
5140                 {
5141                   if (dump_enabled_p ())
5142                     dump_printf (MSG_MISSED_OPTIMIZATION,
5143                                  "permutation op not supported by target.\n");
5144                   return false;
5145                 }
5146
5147               for (i = 0; i < nelt; i++)
5148                 {
5149                   if (3 * i + nelt0 < nelt)
5150                     sel[3 * i + nelt0] = 3 * i + nelt0;
5151                   if (3 * i + nelt1 < nelt)
5152                     sel[3 * i + nelt1] = 3 * i + nelt1;
5153                   if (3 * i + nelt2 < nelt)
5154                     sel[3 * i + nelt2] = nelt + j2++;
5155                 }
5156               indices.new_vector (sel, 2, nelt);
5157               if (!can_vec_perm_const_p (mode, indices))
5158                 {
5159                   if (dump_enabled_p ())
5160                     dump_printf (MSG_MISSED_OPTIMIZATION,
5161                                  "permutation op not supported by target.\n");
5162                   return false;
5163                 }
5164             }
5165           return true;
5166         }
5167       else
5168         {
5169           /* If length is not equal to 3 then only power of 2 is supported.  */
5170           gcc_assert (pow2p_hwi (count));
5171           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5172
5173           /* The encoding has 2 interleaved stepped patterns.  */
5174           vec_perm_builder sel (nelt, 2, 3);
5175           sel.quick_grow (6);
5176           for (i = 0; i < 3; i++)
5177             {
5178               sel[i * 2] = i;
5179               sel[i * 2 + 1] = i + nelt;
5180             }
5181           vec_perm_indices indices (sel, 2, nelt);
5182           if (can_vec_perm_const_p (mode, indices))
5183             {
5184               for (i = 0; i < 6; i++)
5185                 sel[i] += exact_div (nelt, 2);
5186               indices.new_vector (sel, 2, nelt);
5187               if (can_vec_perm_const_p (mode, indices))
5188                 return true;
5189             }
5190         }
5191     }
5192
5193   if (dump_enabled_p ())
5194     dump_printf (MSG_MISSED_OPTIMIZATION,
5195                  "permutation op not supported by target.\n");
5196   return false;
5197 }
5198
5199
5200 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5201    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5202
5203 bool
5204 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5205                             bool masked_p)
5206 {
5207   if (masked_p)
5208     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5209                                          vec_mask_store_lanes_optab,
5210                                          vectype, count);
5211   else
5212     return vect_lanes_optab_supported_p ("vec_store_lanes",
5213                                          vec_store_lanes_optab,
5214                                          vectype, count);
5215 }
5216
5217
5218 /* Function vect_permute_store_chain.
5219
5220    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5221    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5222    the data correctly for the stores.  Return the final references for stores
5223    in RESULT_CHAIN.
5224
5225    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5226    The input is 4 vectors each containing 8 elements.  We assign a number to
5227    each element, the input sequence is:
5228
5229    1st vec:   0  1  2  3  4  5  6  7
5230    2nd vec:   8  9 10 11 12 13 14 15
5231    3rd vec:  16 17 18 19 20 21 22 23
5232    4th vec:  24 25 26 27 28 29 30 31
5233
5234    The output sequence should be:
5235
5236    1st vec:  0  8 16 24  1  9 17 25
5237    2nd vec:  2 10 18 26  3 11 19 27
5238    3rd vec:  4 12 20 28  5 13 21 30
5239    4th vec:  6 14 22 30  7 15 23 31
5240
5241    i.e., we interleave the contents of the four vectors in their order.
5242
5243    We use interleave_high/low instructions to create such output.  The input of
5244    each interleave_high/low operation is two vectors:
5245    1st vec    2nd vec
5246    0 1 2 3    4 5 6 7
5247    the even elements of the result vector are obtained left-to-right from the
5248    high/low elements of the first vector.  The odd elements of the result are
5249    obtained left-to-right from the high/low elements of the second vector.
5250    The output of interleave_high will be:   0 4 1 5
5251    and of interleave_low:                   2 6 3 7
5252
5253
5254    The permutation is done in log LENGTH stages.  In each stage interleave_high
5255    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5256    where the first argument is taken from the first half of DR_CHAIN and the
5257    second argument from it's second half.
5258    In our example,
5259
5260    I1: interleave_high (1st vec, 3rd vec)
5261    I2: interleave_low (1st vec, 3rd vec)
5262    I3: interleave_high (2nd vec, 4th vec)
5263    I4: interleave_low (2nd vec, 4th vec)
5264
5265    The output for the first stage is:
5266
5267    I1:  0 16  1 17  2 18  3 19
5268    I2:  4 20  5 21  6 22  7 23
5269    I3:  8 24  9 25 10 26 11 27
5270    I4: 12 28 13 29 14 30 15 31
5271
5272    The output of the second stage, i.e. the final result is:
5273
5274    I1:  0  8 16 24  1  9 17 25
5275    I2:  2 10 18 26  3 11 19 27
5276    I3:  4 12 20 28  5 13 21 30
5277    I4:  6 14 22 30  7 15 23 31.  */
5278
5279 void
5280 vect_permute_store_chain (vec<tree> dr_chain,
5281                           unsigned int length,
5282                           stmt_vec_info stmt_info,
5283                           gimple_stmt_iterator *gsi,
5284                           vec<tree> *result_chain)
5285 {
5286   tree vect1, vect2, high, low;
5287   gimple *perm_stmt;
5288   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5289   tree perm_mask_low, perm_mask_high;
5290   tree data_ref;
5291   tree perm3_mask_low, perm3_mask_high;
5292   unsigned int i, j, n, log_length = exact_log2 (length);
5293
5294   result_chain->quick_grow (length);
5295   memcpy (result_chain->address (), dr_chain.address (),
5296           length * sizeof (tree));
5297
5298   if (length == 3)
5299     {
5300       /* vect_grouped_store_supported ensures that this is constant.  */
5301       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5302       unsigned int j0 = 0, j1 = 0, j2 = 0;
5303
5304       vec_perm_builder sel (nelt, nelt, 1);
5305       sel.quick_grow (nelt);
5306       vec_perm_indices indices;
5307       for (j = 0; j < 3; j++)
5308         {
5309           int nelt0 = ((3 - j) * nelt) % 3;
5310           int nelt1 = ((3 - j) * nelt + 1) % 3;
5311           int nelt2 = ((3 - j) * nelt + 2) % 3;
5312
5313           for (i = 0; i < nelt; i++)
5314             {
5315               if (3 * i + nelt0 < nelt)
5316                 sel[3 * i + nelt0] = j0++;
5317               if (3 * i + nelt1 < nelt)
5318                 sel[3 * i + nelt1] = nelt + j1++;
5319               if (3 * i + nelt2 < nelt)
5320                 sel[3 * i + nelt2] = 0;
5321             }
5322           indices.new_vector (sel, 2, nelt);
5323           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5324
5325           for (i = 0; i < nelt; i++)
5326             {
5327               if (3 * i + nelt0 < nelt)
5328                 sel[3 * i + nelt0] = 3 * i + nelt0;
5329               if (3 * i + nelt1 < nelt)
5330                 sel[3 * i + nelt1] = 3 * i + nelt1;
5331               if (3 * i + nelt2 < nelt)
5332                 sel[3 * i + nelt2] = nelt + j2++;
5333             }
5334           indices.new_vector (sel, 2, nelt);
5335           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5336
5337           vect1 = dr_chain[0];
5338           vect2 = dr_chain[1];
5339
5340           /* Create interleaving stmt:
5341              low = VEC_PERM_EXPR <vect1, vect2,
5342                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5343                                    j + 2, nelt + j + 2, *, ...}>  */
5344           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5345           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5346                                            vect2, perm3_mask_low);
5347           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5348
5349           vect1 = data_ref;
5350           vect2 = dr_chain[2];
5351           /* Create interleaving stmt:
5352              low = VEC_PERM_EXPR <vect1, vect2,
5353                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5354                                    6, 7, nelt + j + 2, ...}>  */
5355           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5356           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5357                                            vect2, perm3_mask_high);
5358           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5359           (*result_chain)[j] = data_ref;
5360         }
5361     }
5362   else
5363     {
5364       /* If length is not equal to 3 then only power of 2 is supported.  */
5365       gcc_assert (pow2p_hwi (length));
5366
5367       /* The encoding has 2 interleaved stepped patterns.  */
5368       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5369       vec_perm_builder sel (nelt, 2, 3);
5370       sel.quick_grow (6);
5371       for (i = 0; i < 3; i++)
5372         {
5373           sel[i * 2] = i;
5374           sel[i * 2 + 1] = i + nelt;
5375         }
5376         vec_perm_indices indices (sel, 2, nelt);
5377         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5378
5379         for (i = 0; i < 6; i++)
5380           sel[i] += exact_div (nelt, 2);
5381         indices.new_vector (sel, 2, nelt);
5382         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5383
5384         for (i = 0, n = log_length; i < n; i++)
5385           {
5386             for (j = 0; j < length/2; j++)
5387               {
5388                 vect1 = dr_chain[j];
5389                 vect2 = dr_chain[j+length/2];
5390
5391                 /* Create interleaving stmt:
5392                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5393                                                         ...}>  */
5394                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5395                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5396                                                  vect2, perm_mask_high);
5397                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5398                 (*result_chain)[2*j] = high;
5399
5400                 /* Create interleaving stmt:
5401                    low = VEC_PERM_EXPR <vect1, vect2,
5402                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5403                                          ...}>  */
5404                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5405                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5406                                                  vect2, perm_mask_low);
5407                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5408                 (*result_chain)[2*j+1] = low;
5409               }
5410             memcpy (dr_chain.address (), result_chain->address (),
5411                     length * sizeof (tree));
5412           }
5413     }
5414 }
5415
5416 /* Function vect_setup_realignment
5417
5418    This function is called when vectorizing an unaligned load using
5419    the dr_explicit_realign[_optimized] scheme.
5420    This function generates the following code at the loop prolog:
5421
5422       p = initial_addr;
5423    x  msq_init = *(floor(p));   # prolog load
5424       realignment_token = call target_builtin;
5425     loop:
5426    x  msq = phi (msq_init, ---)
5427
5428    The stmts marked with x are generated only for the case of
5429    dr_explicit_realign_optimized.
5430
5431    The code above sets up a new (vector) pointer, pointing to the first
5432    location accessed by STMT_INFO, and a "floor-aligned" load using that
5433    pointer.  It also generates code to compute the "realignment-token"
5434    (if the relevant target hook was defined), and creates a phi-node at the
5435    loop-header bb whose arguments are the result of the prolog-load (created
5436    by this function) and the result of a load that takes place in the loop
5437    (to be created by the caller to this function).
5438
5439    For the case of dr_explicit_realign_optimized:
5440    The caller to this function uses the phi-result (msq) to create the
5441    realignment code inside the loop, and sets up the missing phi argument,
5442    as follows:
5443     loop:
5444       msq = phi (msq_init, lsq)
5445       lsq = *(floor(p'));        # load in loop
5446       result = realign_load (msq, lsq, realignment_token);
5447
5448    For the case of dr_explicit_realign:
5449     loop:
5450       msq = *(floor(p));        # load in loop
5451       p' = p + (VS-1);
5452       lsq = *(floor(p'));       # load in loop
5453       result = realign_load (msq, lsq, realignment_token);
5454
5455    Input:
5456    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5457                a memory location that may be unaligned.
5458    BSI - place where new code is to be inserted.
5459    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5460                               is used.
5461
5462    Output:
5463    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5464                        target hook, if defined.
5465    Return value - the result of the loop-header phi node.  */
5466
5467 tree
5468 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5469                         tree *realignment_token,
5470                         enum dr_alignment_support alignment_support_scheme,
5471                         tree init_addr,
5472                         class loop **at_loop)
5473 {
5474   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5475   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5476   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5477   struct data_reference *dr = dr_info->dr;
5478   class loop *loop = NULL;
5479   edge pe = NULL;
5480   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5481   tree vec_dest;
5482   gimple *inc;
5483   tree ptr;
5484   tree data_ref;
5485   basic_block new_bb;
5486   tree msq_init = NULL_TREE;
5487   tree new_temp;
5488   gphi *phi_stmt;
5489   tree msq = NULL_TREE;
5490   gimple_seq stmts = NULL;
5491   bool compute_in_loop = false;
5492   bool nested_in_vect_loop = false;
5493   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5494   class loop *loop_for_initial_load = NULL;
5495
5496   if (loop_vinfo)
5497     {
5498       loop = LOOP_VINFO_LOOP (loop_vinfo);
5499       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5500     }
5501
5502   gcc_assert (alignment_support_scheme == dr_explicit_realign
5503               || alignment_support_scheme == dr_explicit_realign_optimized);
5504
5505   /* We need to generate three things:
5506      1. the misalignment computation
5507      2. the extra vector load (for the optimized realignment scheme).
5508      3. the phi node for the two vectors from which the realignment is
5509       done (for the optimized realignment scheme).  */
5510
5511   /* 1. Determine where to generate the misalignment computation.
5512
5513      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5514      calculation will be generated by this function, outside the loop (in the
5515      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5516      caller, inside the loop.
5517
5518      Background: If the misalignment remains fixed throughout the iterations of
5519      the loop, then both realignment schemes are applicable, and also the
5520      misalignment computation can be done outside LOOP.  This is because we are
5521      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5522      are a multiple of VS (the Vector Size), and therefore the misalignment in
5523      different vectorized LOOP iterations is always the same.
5524      The problem arises only if the memory access is in an inner-loop nested
5525      inside LOOP, which is now being vectorized using outer-loop vectorization.
5526      This is the only case when the misalignment of the memory access may not
5527      remain fixed throughout the iterations of the inner-loop (as explained in
5528      detail in vect_supportable_dr_alignment).  In this case, not only is the
5529      optimized realignment scheme not applicable, but also the misalignment
5530      computation (and generation of the realignment token that is passed to
5531      REALIGN_LOAD) have to be done inside the loop.
5532
5533      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5534      or not, which in turn determines if the misalignment is computed inside
5535      the inner-loop, or outside LOOP.  */
5536
5537   if (init_addr != NULL_TREE || !loop_vinfo)
5538     {
5539       compute_in_loop = true;
5540       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5541     }
5542
5543
5544   /* 2. Determine where to generate the extra vector load.
5545
5546      For the optimized realignment scheme, instead of generating two vector
5547      loads in each iteration, we generate a single extra vector load in the
5548      preheader of the loop, and in each iteration reuse the result of the
5549      vector load from the previous iteration.  In case the memory access is in
5550      an inner-loop nested inside LOOP, which is now being vectorized using
5551      outer-loop vectorization, we need to determine whether this initial vector
5552      load should be generated at the preheader of the inner-loop, or can be
5553      generated at the preheader of LOOP.  If the memory access has no evolution
5554      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5555      to be generated inside LOOP (in the preheader of the inner-loop).  */
5556
5557   if (nested_in_vect_loop)
5558     {
5559       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5560       bool invariant_in_outerloop =
5561             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5562       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5563     }
5564   else
5565     loop_for_initial_load = loop;
5566   if (at_loop)
5567     *at_loop = loop_for_initial_load;
5568
5569   if (loop_for_initial_load)
5570     pe = loop_preheader_edge (loop_for_initial_load);
5571
5572   /* 3. For the case of the optimized realignment, create the first vector
5573       load at the loop preheader.  */
5574
5575   if (alignment_support_scheme == dr_explicit_realign_optimized)
5576     {
5577       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5578       gassign *new_stmt;
5579
5580       gcc_assert (!compute_in_loop);
5581       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5582       ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5583                                       loop_for_initial_load, NULL_TREE,
5584                                       &init_addr, NULL, &inc, true);
5585       if (TREE_CODE (ptr) == SSA_NAME)
5586         new_temp = copy_ssa_name (ptr);
5587       else
5588         new_temp = make_ssa_name (TREE_TYPE (ptr));
5589       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5590       tree type = TREE_TYPE (ptr);
5591       new_stmt = gimple_build_assign
5592                    (new_temp, BIT_AND_EXPR, ptr,
5593                     fold_build2 (MINUS_EXPR, type,
5594                                  build_int_cst (type, 0),
5595                                  build_int_cst (type, align)));
5596       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5597       gcc_assert (!new_bb);
5598       data_ref
5599         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5600                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5601       vect_copy_ref_info (data_ref, DR_REF (dr));
5602       new_stmt = gimple_build_assign (vec_dest, data_ref);
5603       new_temp = make_ssa_name (vec_dest, new_stmt);
5604       gimple_assign_set_lhs (new_stmt, new_temp);
5605       if (pe)
5606         {
5607           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5608           gcc_assert (!new_bb);
5609         }
5610       else
5611          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5612
5613       msq_init = gimple_assign_lhs (new_stmt);
5614     }
5615
5616   /* 4. Create realignment token using a target builtin, if available.
5617       It is done either inside the containing loop, or before LOOP (as
5618       determined above).  */
5619
5620   if (targetm.vectorize.builtin_mask_for_load)
5621     {
5622       gcall *new_stmt;
5623       tree builtin_decl;
5624
5625       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5626       if (!init_addr)
5627         {
5628           /* Generate the INIT_ADDR computation outside LOOP.  */
5629           init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5630                                                             NULL_TREE);
5631           if (loop)
5632             {
5633               pe = loop_preheader_edge (loop);
5634               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5635               gcc_assert (!new_bb);
5636             }
5637           else
5638              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5639         }
5640
5641       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5642       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5643       vec_dest =
5644         vect_create_destination_var (scalar_dest,
5645                                      gimple_call_return_type (new_stmt));
5646       new_temp = make_ssa_name (vec_dest, new_stmt);
5647       gimple_call_set_lhs (new_stmt, new_temp);
5648
5649       if (compute_in_loop)
5650         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5651       else
5652         {
5653           /* Generate the misalignment computation outside LOOP.  */
5654           pe = loop_preheader_edge (loop);
5655           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5656           gcc_assert (!new_bb);
5657         }
5658
5659       *realignment_token = gimple_call_lhs (new_stmt);
5660
5661       /* The result of the CALL_EXPR to this builtin is determined from
5662          the value of the parameter and no global variables are touched
5663          which makes the builtin a "const" function.  Requiring the
5664          builtin to have the "const" attribute makes it unnecessary
5665          to call mark_call_clobbered.  */
5666       gcc_assert (TREE_READONLY (builtin_decl));
5667     }
5668
5669   if (alignment_support_scheme == dr_explicit_realign)
5670     return msq;
5671
5672   gcc_assert (!compute_in_loop);
5673   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5674
5675
5676   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5677
5678   pe = loop_preheader_edge (containing_loop);
5679   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5680   msq = make_ssa_name (vec_dest);
5681   phi_stmt = create_phi_node (msq, containing_loop->header);
5682   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5683
5684   return msq;
5685 }
5686
5687
5688 /* Function vect_grouped_load_supported.
5689
5690    COUNT is the size of the load group (the number of statements plus the
5691    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5692    only one statement, with a gap of COUNT - 1.
5693
5694    Returns true if a suitable permute exists.  */
5695
5696 bool
5697 vect_grouped_load_supported (tree vectype, bool single_element_p,
5698                              unsigned HOST_WIDE_INT count)
5699 {
5700   machine_mode mode = TYPE_MODE (vectype);
5701
5702   /* If this is single-element interleaving with an element distance
5703      that leaves unused vector loads around punt - we at least create
5704      very sub-optimal code in that case (and blow up memory,
5705      see PR65518).  */
5706   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5707     {
5708       if (dump_enabled_p ())
5709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5710                          "single-element interleaving not supported "
5711                          "for not adjacent vector loads\n");
5712       return false;
5713     }
5714
5715   /* vect_permute_load_chain requires the group size to be equal to 3 or
5716      be a power of two.  */
5717   if (count != 3 && exact_log2 (count) == -1)
5718     {
5719       if (dump_enabled_p ())
5720         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5721                          "the size of the group of accesses"
5722                          " is not a power of 2 or not equal to 3\n");
5723       return false;
5724     }
5725
5726   /* Check that the permutation is supported.  */
5727   if (VECTOR_MODE_P (mode))
5728     {
5729       unsigned int i, j;
5730       if (count == 3)
5731         {
5732           unsigned int nelt;
5733           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5734             {
5735               if (dump_enabled_p ())
5736                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5737                                  "cannot handle groups of 3 loads for"
5738                                  " variable-length vectors\n");
5739               return false;
5740             }
5741
5742           vec_perm_builder sel (nelt, nelt, 1);
5743           sel.quick_grow (nelt);
5744           vec_perm_indices indices;
5745           unsigned int k;
5746           for (k = 0; k < 3; k++)
5747             {
5748               for (i = 0; i < nelt; i++)
5749                 if (3 * i + k < 2 * nelt)
5750                   sel[i] = 3 * i + k;
5751                 else
5752                   sel[i] = 0;
5753               indices.new_vector (sel, 2, nelt);
5754               if (!can_vec_perm_const_p (mode, indices))
5755                 {
5756                   if (dump_enabled_p ())
5757                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758                                      "shuffle of 3 loads is not supported by"
5759                                      " target\n");
5760                   return false;
5761                 }
5762               for (i = 0, j = 0; i < nelt; i++)
5763                 if (3 * i + k < 2 * nelt)
5764                   sel[i] = i;
5765                 else
5766                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5767               indices.new_vector (sel, 2, nelt);
5768               if (!can_vec_perm_const_p (mode, indices))
5769                 {
5770                   if (dump_enabled_p ())
5771                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5772                                      "shuffle of 3 loads is not supported by"
5773                                      " target\n");
5774                   return false;
5775                 }
5776             }
5777           return true;
5778         }
5779       else
5780         {
5781           /* If length is not equal to 3 then only power of 2 is supported.  */
5782           gcc_assert (pow2p_hwi (count));
5783           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5784
5785           /* The encoding has a single stepped pattern.  */
5786           vec_perm_builder sel (nelt, 1, 3);
5787           sel.quick_grow (3);
5788           for (i = 0; i < 3; i++)
5789             sel[i] = i * 2;
5790           vec_perm_indices indices (sel, 2, nelt);
5791           if (can_vec_perm_const_p (mode, indices))
5792             {
5793               for (i = 0; i < 3; i++)
5794                 sel[i] = i * 2 + 1;
5795               indices.new_vector (sel, 2, nelt);
5796               if (can_vec_perm_const_p (mode, indices))
5797                 return true;
5798             }
5799         }
5800     }
5801
5802   if (dump_enabled_p ())
5803     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804                      "extract even/odd not supported by target\n");
5805   return false;
5806 }
5807
5808 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5809    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5810
5811 bool
5812 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5813                            bool masked_p)
5814 {
5815   if (masked_p)
5816     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5817                                          vec_mask_load_lanes_optab,
5818                                          vectype, count);
5819   else
5820     return vect_lanes_optab_supported_p ("vec_load_lanes",
5821                                          vec_load_lanes_optab,
5822                                          vectype, count);
5823 }
5824
5825 /* Function vect_permute_load_chain.
5826
5827    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5828    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5829    the input data correctly.  Return the final references for loads in
5830    RESULT_CHAIN.
5831
5832    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5833    The input is 4 vectors each containing 8 elements. We assign a number to each
5834    element, the input sequence is:
5835
5836    1st vec:   0  1  2  3  4  5  6  7
5837    2nd vec:   8  9 10 11 12 13 14 15
5838    3rd vec:  16 17 18 19 20 21 22 23
5839    4th vec:  24 25 26 27 28 29 30 31
5840
5841    The output sequence should be:
5842
5843    1st vec:  0 4  8 12 16 20 24 28
5844    2nd vec:  1 5  9 13 17 21 25 29
5845    3rd vec:  2 6 10 14 18 22 26 30
5846    4th vec:  3 7 11 15 19 23 27 31
5847
5848    i.e., the first output vector should contain the first elements of each
5849    interleaving group, etc.
5850
5851    We use extract_even/odd instructions to create such output.  The input of
5852    each extract_even/odd operation is two vectors
5853    1st vec    2nd vec
5854    0 1 2 3    4 5 6 7
5855
5856    and the output is the vector of extracted even/odd elements.  The output of
5857    extract_even will be:   0 2 4 6
5858    and of extract_odd:     1 3 5 7
5859
5860
5861    The permutation is done in log LENGTH stages.  In each stage extract_even
5862    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5863    their order.  In our example,
5864
5865    E1: extract_even (1st vec, 2nd vec)
5866    E2: extract_odd (1st vec, 2nd vec)
5867    E3: extract_even (3rd vec, 4th vec)
5868    E4: extract_odd (3rd vec, 4th vec)
5869
5870    The output for the first stage will be:
5871
5872    E1:  0  2  4  6  8 10 12 14
5873    E2:  1  3  5  7  9 11 13 15
5874    E3: 16 18 20 22 24 26 28 30
5875    E4: 17 19 21 23 25 27 29 31
5876
5877    In order to proceed and create the correct sequence for the next stage (or
5878    for the correct output, if the second stage is the last one, as in our
5879    example), we first put the output of extract_even operation and then the
5880    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5881    The input for the second stage is:
5882
5883    1st vec (E1):  0  2  4  6  8 10 12 14
5884    2nd vec (E3): 16 18 20 22 24 26 28 30
5885    3rd vec (E2):  1  3  5  7  9 11 13 15
5886    4th vec (E4): 17 19 21 23 25 27 29 31
5887
5888    The output of the second stage:
5889
5890    E1: 0 4  8 12 16 20 24 28
5891    E2: 2 6 10 14 18 22 26 30
5892    E3: 1 5  9 13 17 21 25 29
5893    E4: 3 7 11 15 19 23 27 31
5894
5895    And RESULT_CHAIN after reordering:
5896
5897    1st vec (E1):  0 4  8 12 16 20 24 28
5898    2nd vec (E3):  1 5  9 13 17 21 25 29
5899    3rd vec (E2):  2 6 10 14 18 22 26 30
5900    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5901
5902 static void
5903 vect_permute_load_chain (vec<tree> dr_chain,
5904                          unsigned int length,
5905                          stmt_vec_info stmt_info,
5906                          gimple_stmt_iterator *gsi,
5907                          vec<tree> *result_chain)
5908 {
5909   tree data_ref, first_vect, second_vect;
5910   tree perm_mask_even, perm_mask_odd;
5911   tree perm3_mask_low, perm3_mask_high;
5912   gimple *perm_stmt;
5913   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5914   unsigned int i, j, log_length = exact_log2 (length);
5915
5916   result_chain->quick_grow (length);
5917   memcpy (result_chain->address (), dr_chain.address (),
5918           length * sizeof (tree));
5919
5920   if (length == 3)
5921     {
5922       /* vect_grouped_load_supported ensures that this is constant.  */
5923       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5924       unsigned int k;
5925
5926       vec_perm_builder sel (nelt, nelt, 1);
5927       sel.quick_grow (nelt);
5928       vec_perm_indices indices;
5929       for (k = 0; k < 3; k++)
5930         {
5931           for (i = 0; i < nelt; i++)
5932             if (3 * i + k < 2 * nelt)
5933               sel[i] = 3 * i + k;
5934             else
5935               sel[i] = 0;
5936           indices.new_vector (sel, 2, nelt);
5937           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5938
5939           for (i = 0, j = 0; i < nelt; i++)
5940             if (3 * i + k < 2 * nelt)
5941               sel[i] = i;
5942             else
5943               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5944           indices.new_vector (sel, 2, nelt);
5945           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5946
5947           first_vect = dr_chain[0];
5948           second_vect = dr_chain[1];
5949
5950           /* Create interleaving stmt (low part of):
5951              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5952                                                              ...}>  */
5953           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5954           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5955                                            second_vect, perm3_mask_low);
5956           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5957
5958           /* Create interleaving stmt (high part of):
5959              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5960                                                               ...}>  */
5961           first_vect = data_ref;
5962           second_vect = dr_chain[2];
5963           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5964           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5965                                            second_vect, perm3_mask_high);
5966           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5967           (*result_chain)[k] = data_ref;
5968         }
5969     }
5970   else
5971     {
5972       /* If length is not equal to 3 then only power of 2 is supported.  */
5973       gcc_assert (pow2p_hwi (length));
5974
5975       /* The encoding has a single stepped pattern.  */
5976       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5977       vec_perm_builder sel (nelt, 1, 3);
5978       sel.quick_grow (3);
5979       for (i = 0; i < 3; ++i)
5980         sel[i] = i * 2;
5981       vec_perm_indices indices (sel, 2, nelt);
5982       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5983
5984       for (i = 0; i < 3; ++i)
5985         sel[i] = i * 2 + 1;
5986       indices.new_vector (sel, 2, nelt);
5987       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5988
5989       for (i = 0; i < log_length; i++)
5990         {
5991           for (j = 0; j < length; j += 2)
5992             {
5993               first_vect = dr_chain[j];
5994               second_vect = dr_chain[j+1];
5995
5996               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5997               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5998               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5999                                                first_vect, second_vect,
6000                                                perm_mask_even);
6001               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6002               (*result_chain)[j/2] = data_ref;
6003
6004               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6005               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6006               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6007                                                first_vect, second_vect,
6008                                                perm_mask_odd);
6009               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6010               (*result_chain)[j/2+length/2] = data_ref;
6011             }
6012           memcpy (dr_chain.address (), result_chain->address (),
6013                   length * sizeof (tree));
6014         }
6015     }
6016 }
6017
6018 /* Function vect_shift_permute_load_chain.
6019
6020    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6021    sequence of stmts to reorder the input data accordingly.
6022    Return the final references for loads in RESULT_CHAIN.
6023    Return true if successed, false otherwise.
6024
6025    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6026    The input is 3 vectors each containing 8 elements.  We assign a
6027    number to each element, the input sequence is:
6028
6029    1st vec:   0  1  2  3  4  5  6  7
6030    2nd vec:   8  9 10 11 12 13 14 15
6031    3rd vec:  16 17 18 19 20 21 22 23
6032
6033    The output sequence should be:
6034
6035    1st vec:  0 3 6  9 12 15 18 21
6036    2nd vec:  1 4 7 10 13 16 19 22
6037    3rd vec:  2 5 8 11 14 17 20 23
6038
6039    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6040
6041    First we shuffle all 3 vectors to get correct elements order:
6042
6043    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6044    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6045    3rd vec:  (16 19 22) (17 20 23) (18 21)
6046
6047    Next we unite and shift vector 3 times:
6048
6049    1st step:
6050      shift right by 6 the concatenation of:
6051      "1st vec" and  "2nd vec"
6052        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6053      "2nd vec" and  "3rd vec"
6054        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6055      "3rd vec" and  "1st vec"
6056        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6057                              | New vectors                   |
6058
6059      So that now new vectors are:
6060
6061      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6062      2nd vec:  (10 13) (16 19 22) (17 20 23)
6063      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6064
6065    2nd step:
6066      shift right by 5 the concatenation of:
6067      "1st vec" and  "3rd vec"
6068        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6069      "2nd vec" and  "1st vec"
6070        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6071      "3rd vec" and  "2nd vec"
6072        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6073                           | New vectors                   |
6074
6075      So that now new vectors are:
6076
6077      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6078      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6079      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6080
6081    3rd step:
6082      shift right by 5 the concatenation of:
6083      "1st vec" and  "1st vec"
6084        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6085      shift right by 3 the concatenation of:
6086      "2nd vec" and  "2nd vec"
6087                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6088                           | New vectors                   |
6089
6090      So that now all vectors are READY:
6091      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6092      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6093      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6094
6095    This algorithm is faster than one in vect_permute_load_chain if:
6096      1.  "shift of a concatination" is faster than general permutation.
6097          This is usually so.
6098      2.  The TARGET machine can't execute vector instructions in parallel.
6099          This is because each step of the algorithm depends on previous.
6100          The algorithm in vect_permute_load_chain is much more parallel.
6101
6102    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6103 */
6104
6105 static bool
6106 vect_shift_permute_load_chain (vec<tree> dr_chain,
6107                                unsigned int length,
6108                                stmt_vec_info stmt_info,
6109                                gimple_stmt_iterator *gsi,
6110                                vec<tree> *result_chain)
6111 {
6112   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6113   tree perm2_mask1, perm2_mask2, perm3_mask;
6114   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6115   gimple *perm_stmt;
6116
6117   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6118   unsigned int i;
6119   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6120
6121   unsigned HOST_WIDE_INT nelt, vf;
6122   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6123       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6124     /* Not supported for variable-length vectors.  */
6125     return false;
6126
6127   vec_perm_builder sel (nelt, nelt, 1);
6128   sel.quick_grow (nelt);
6129
6130   result_chain->quick_grow (length);
6131   memcpy (result_chain->address (), dr_chain.address (),
6132           length * sizeof (tree));
6133
6134   if (pow2p_hwi (length) && vf > 4)
6135     {
6136       unsigned int j, log_length = exact_log2 (length);
6137       for (i = 0; i < nelt / 2; ++i)
6138         sel[i] = i * 2;
6139       for (i = 0; i < nelt / 2; ++i)
6140         sel[nelt / 2 + i] = i * 2 + 1;
6141       vec_perm_indices indices (sel, 2, nelt);
6142       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6143         {
6144           if (dump_enabled_p ())
6145             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146                              "shuffle of 2 fields structure is not \
6147                               supported by target\n");
6148           return false;
6149         }
6150       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6151
6152       for (i = 0; i < nelt / 2; ++i)
6153         sel[i] = i * 2 + 1;
6154       for (i = 0; i < nelt / 2; ++i)
6155         sel[nelt / 2 + i] = i * 2;
6156       indices.new_vector (sel, 2, nelt);
6157       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6158         {
6159           if (dump_enabled_p ())
6160             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6161                              "shuffle of 2 fields structure is not \
6162                               supported by target\n");
6163           return false;
6164         }
6165       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6166
6167       /* Generating permutation constant to shift all elements.
6168          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6169       for (i = 0; i < nelt; i++)
6170         sel[i] = nelt / 2 + i;
6171       indices.new_vector (sel, 2, nelt);
6172       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6173         {
6174           if (dump_enabled_p ())
6175             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6176                              "shift permutation is not supported by target\n");
6177           return false;
6178         }
6179       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6180
6181       /* Generating permutation constant to select vector from 2.
6182          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6183       for (i = 0; i < nelt / 2; i++)
6184         sel[i] = i;
6185       for (i = nelt / 2; i < nelt; i++)
6186         sel[i] = nelt + i;
6187       indices.new_vector (sel, 2, nelt);
6188       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6189         {
6190           if (dump_enabled_p ())
6191             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6192                              "select is not supported by target\n");
6193           return false;
6194         }
6195       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6196
6197       for (i = 0; i < log_length; i++)
6198         {
6199           for (j = 0; j < length; j += 2)
6200             {
6201               first_vect = dr_chain[j];
6202               second_vect = dr_chain[j + 1];
6203
6204               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6205               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6206                                                first_vect, first_vect,
6207                                                perm2_mask1);
6208               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6209               vect[0] = data_ref;
6210
6211               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6212               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6213                                                second_vect, second_vect,
6214                                                perm2_mask2);
6215               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6216               vect[1] = data_ref;
6217
6218               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6219               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6220                                                vect[0], vect[1], shift1_mask);
6221               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6222               (*result_chain)[j/2 + length/2] = data_ref;
6223
6224               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6225               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6226                                                vect[0], vect[1], select_mask);
6227               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6228               (*result_chain)[j/2] = data_ref;
6229             }
6230           memcpy (dr_chain.address (), result_chain->address (),
6231                   length * sizeof (tree));
6232         }
6233       return true;
6234     }
6235   if (length == 3 && vf > 2)
6236     {
6237       unsigned int k = 0, l = 0;
6238
6239       /* Generating permutation constant to get all elements in rigth order.
6240          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6241       for (i = 0; i < nelt; i++)
6242         {
6243           if (3 * k + (l % 3) >= nelt)
6244             {
6245               k = 0;
6246               l += (3 - (nelt % 3));
6247             }
6248           sel[i] = 3 * k + (l % 3);
6249           k++;
6250         }
6251       vec_perm_indices indices (sel, 2, nelt);
6252       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6253         {
6254           if (dump_enabled_p ())
6255             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256                              "shuffle of 3 fields structure is not \
6257                               supported by target\n");
6258           return false;
6259         }
6260       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6261
6262       /* Generating permutation constant to shift all elements.
6263          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6264       for (i = 0; i < nelt; i++)
6265         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6266       indices.new_vector (sel, 2, nelt);
6267       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6268         {
6269           if (dump_enabled_p ())
6270             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6271                              "shift permutation is not supported by target\n");
6272           return false;
6273         }
6274       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6275
6276       /* Generating permutation constant to shift all elements.
6277          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6278       for (i = 0; i < nelt; i++)
6279         sel[i] = 2 * (nelt / 3) + 1 + i;
6280       indices.new_vector (sel, 2, nelt);
6281       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6282         {
6283           if (dump_enabled_p ())
6284             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6285                              "shift permutation is not supported by target\n");
6286           return false;
6287         }
6288       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6289
6290       /* Generating permutation constant to shift all elements.
6291          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6292       for (i = 0; i < nelt; i++)
6293         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6294       indices.new_vector (sel, 2, nelt);
6295       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6296         {
6297           if (dump_enabled_p ())
6298             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6299                              "shift permutation is not supported by target\n");
6300           return false;
6301         }
6302       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6303
6304       /* Generating permutation constant to shift all elements.
6305          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6306       for (i = 0; i < nelt; i++)
6307         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6308       indices.new_vector (sel, 2, nelt);
6309       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6310         {
6311           if (dump_enabled_p ())
6312             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6313                              "shift permutation is not supported by target\n");
6314           return false;
6315         }
6316       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6317
6318       for (k = 0; k < 3; k++)
6319         {
6320           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6321           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6322                                            dr_chain[k], dr_chain[k],
6323                                            perm3_mask);
6324           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6325           vect[k] = data_ref;
6326         }
6327
6328       for (k = 0; k < 3; k++)
6329         {
6330           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6331           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6332                                            vect[k % 3], vect[(k + 1) % 3],
6333                                            shift1_mask);
6334           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6335           vect_shift[k] = data_ref;
6336         }
6337
6338       for (k = 0; k < 3; k++)
6339         {
6340           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6341           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6342                                            vect_shift[(4 - k) % 3],
6343                                            vect_shift[(3 - k) % 3],
6344                                            shift2_mask);
6345           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6346           vect[k] = data_ref;
6347         }
6348
6349       (*result_chain)[3 - (nelt % 3)] = vect[2];
6350
6351       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6352       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6353                                        vect[0], shift3_mask);
6354       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6355       (*result_chain)[nelt % 3] = data_ref;
6356
6357       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6358       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6359                                        vect[1], shift4_mask);
6360       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6361       (*result_chain)[0] = data_ref;
6362       return true;
6363     }
6364   return false;
6365 }
6366
6367 /* Function vect_transform_grouped_load.
6368
6369    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6370    to perform their permutation and ascribe the result vectorized statements to
6371    the scalar statements.
6372 */
6373
6374 void
6375 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6376                              int size, gimple_stmt_iterator *gsi)
6377 {
6378   machine_mode mode;
6379   vec<tree> result_chain = vNULL;
6380
6381   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6382      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6383      vectors, that are ready for vector computation.  */
6384   result_chain.create (size);
6385
6386   /* If reassociation width for vector type is 2 or greater target machine can
6387      execute 2 or more vector instructions in parallel.  Otherwise try to
6388      get chain for loads group using vect_shift_permute_load_chain.  */
6389   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6390   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6391       || pow2p_hwi (size)
6392       || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6393                                          gsi, &result_chain))
6394     vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6395   vect_record_grouped_load_vectors (stmt_info, result_chain);
6396   result_chain.release ();
6397 }
6398
6399 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6400    generated as part of the vectorization of STMT_INFO.  Assign the statement
6401    for each vector to the associated scalar statement.  */
6402
6403 void
6404 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6405                                   vec<tree> result_chain)
6406 {
6407   vec_info *vinfo = stmt_info->vinfo;
6408   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6409   unsigned int i, gap_count;
6410   tree tmp_data_ref;
6411
6412   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6413      Since we scan the chain starting from it's first node, their order
6414      corresponds the order of data-refs in RESULT_CHAIN.  */
6415   stmt_vec_info next_stmt_info = first_stmt_info;
6416   gap_count = 1;
6417   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6418     {
6419       if (!next_stmt_info)
6420         break;
6421
6422       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6423        code elimination pass later.  No need to check for the first stmt in
6424        the group, since it always exists.
6425        DR_GROUP_GAP is the number of steps in elements from the previous
6426        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6427        correspond to the gaps.  */
6428       if (next_stmt_info != first_stmt_info
6429           && gap_count < DR_GROUP_GAP (next_stmt_info))
6430         {
6431           gap_count++;
6432           continue;
6433         }
6434
6435       /* ???  The following needs cleanup after the removal of
6436          DR_GROUP_SAME_DR_STMT.  */
6437       if (next_stmt_info)
6438         {
6439           stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6440           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6441              copies, and we put the new vector statement in the first available
6442              RELATED_STMT.  */
6443           if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6444             STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6445           else
6446             {
6447               stmt_vec_info prev_stmt_info
6448                 = STMT_VINFO_VEC_STMT (next_stmt_info);
6449               stmt_vec_info rel_stmt_info
6450                 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6451               while (rel_stmt_info)
6452                 {
6453                   prev_stmt_info = rel_stmt_info;
6454                   rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6455                 }
6456
6457               STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6458             }
6459
6460           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6461           gap_count = 1;
6462         }
6463     }
6464 }
6465
6466 /* Function vect_force_dr_alignment_p.
6467
6468    Returns whether the alignment of a DECL can be forced to be aligned
6469    on ALIGNMENT bit boundary.  */
6470
6471 bool
6472 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6473 {
6474   if (!VAR_P (decl))
6475     return false;
6476
6477   if (decl_in_symtab_p (decl)
6478       && !symtab_node::get (decl)->can_increase_alignment_p ())
6479     return false;
6480
6481   if (TREE_STATIC (decl))
6482     return (known_le (alignment,
6483                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6484   else
6485     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6486 }
6487
6488
6489 /* Return whether the data reference DR_INFO is supported with respect to its
6490    alignment.
6491    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6492    it is aligned, i.e., check if it is possible to vectorize it with different
6493    alignment.  */
6494
6495 enum dr_alignment_support
6496 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6497                                bool check_aligned_accesses)
6498 {
6499   data_reference *dr = dr_info->dr;
6500   stmt_vec_info stmt_info = dr_info->stmt;
6501   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6502   machine_mode mode = TYPE_MODE (vectype);
6503   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6504   class loop *vect_loop = NULL;
6505   bool nested_in_vect_loop = false;
6506
6507   if (aligned_access_p (dr_info) && !check_aligned_accesses)
6508     return dr_aligned;
6509
6510   /* For now assume all conditional loads/stores support unaligned
6511      access without any special code.  */
6512   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6513     if (gimple_call_internal_p (stmt)
6514         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6515             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6516       return dr_unaligned_supported;
6517
6518   if (loop_vinfo)
6519     {
6520       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6521       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6522     }
6523
6524   /* Possibly unaligned access.  */
6525
6526   /* We can choose between using the implicit realignment scheme (generating
6527      a misaligned_move stmt) and the explicit realignment scheme (generating
6528      aligned loads with a REALIGN_LOAD).  There are two variants to the
6529      explicit realignment scheme: optimized, and unoptimized.
6530      We can optimize the realignment only if the step between consecutive
6531      vector loads is equal to the vector size.  Since the vector memory
6532      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6533      is guaranteed that the misalignment amount remains the same throughout the
6534      execution of the vectorized loop.  Therefore, we can create the
6535      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6536      at the loop preheader.
6537
6538      However, in the case of outer-loop vectorization, when vectorizing a
6539      memory access in the inner-loop nested within the LOOP that is now being
6540      vectorized, while it is guaranteed that the misalignment of the
6541      vectorized memory access will remain the same in different outer-loop
6542      iterations, it is *not* guaranteed that is will remain the same throughout
6543      the execution of the inner-loop.  This is because the inner-loop advances
6544      with the original scalar step (and not in steps of VS).  If the inner-loop
6545      step happens to be a multiple of VS, then the misalignment remains fixed
6546      and we can use the optimized realignment scheme.  For example:
6547
6548       for (i=0; i<N; i++)
6549         for (j=0; j<M; j++)
6550           s += a[i+j];
6551
6552      When vectorizing the i-loop in the above example, the step between
6553      consecutive vector loads is 1, and so the misalignment does not remain
6554      fixed across the execution of the inner-loop, and the realignment cannot
6555      be optimized (as illustrated in the following pseudo vectorized loop):
6556
6557       for (i=0; i<N; i+=4)
6558         for (j=0; j<M; j++){
6559           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6560                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6561                          // (assuming that we start from an aligned address).
6562           }
6563
6564      We therefore have to use the unoptimized realignment scheme:
6565
6566       for (i=0; i<N; i+=4)
6567           for (j=k; j<M; j+=4)
6568           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6569                            // that the misalignment of the initial address is
6570                            // 0).
6571
6572      The loop can then be vectorized as follows:
6573
6574       for (k=0; k<4; k++){
6575         rt = get_realignment_token (&vp[k]);
6576         for (i=0; i<N; i+=4){
6577           v1 = vp[i+k];
6578           for (j=k; j<M; j+=4){
6579             v2 = vp[i+j+VS-1];
6580             va = REALIGN_LOAD <v1,v2,rt>;
6581             vs += va;
6582             v1 = v2;
6583           }
6584         }
6585     } */
6586
6587   if (DR_IS_READ (dr))
6588     {
6589       bool is_packed = false;
6590       tree type = (TREE_TYPE (DR_REF (dr)));
6591
6592       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6593           && (!targetm.vectorize.builtin_mask_for_load
6594               || targetm.vectorize.builtin_mask_for_load ()))
6595         {
6596           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6597
6598           /* If we are doing SLP then the accesses need not have the
6599              same alignment, instead it depends on the SLP group size.  */
6600           if (loop_vinfo
6601               && STMT_SLP_TYPE (stmt_info)
6602               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6603                               * (DR_GROUP_SIZE
6604                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6605                               TYPE_VECTOR_SUBPARTS (vectype)))
6606             ;
6607           else if (!loop_vinfo
6608                    || (nested_in_vect_loop
6609                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6610                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6611             return dr_explicit_realign;
6612           else
6613             return dr_explicit_realign_optimized;
6614         }
6615       if (!known_alignment_for_access_p (dr_info))
6616         is_packed = not_size_aligned (DR_REF (dr));
6617
6618       if (targetm.vectorize.support_vector_misalignment
6619             (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6620         /* Can't software pipeline the loads, but can at least do them.  */
6621         return dr_unaligned_supported;
6622     }
6623   else
6624     {
6625       bool is_packed = false;
6626       tree type = (TREE_TYPE (DR_REF (dr)));
6627
6628       if (!known_alignment_for_access_p (dr_info))
6629         is_packed = not_size_aligned (DR_REF (dr));
6630
6631      if (targetm.vectorize.support_vector_misalignment
6632            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6633        return dr_unaligned_supported;
6634     }
6635
6636   /* Unsupported.  */
6637   return dr_unaligned_unsupported;
6638 }